Example #1
    def _delete_spiked_events(self, expiry_datetime):
        logger.info('{} Starting to delete spiked events'.format(self.log_msg))
        events_service = get_resource_service('events')

        events_deleted = set()
        series_to_delete = dict()

        # Obtain the full list of Events that we're to process first
        # As subsequent queries will change the list of returned items
        events = dict()
        for items in events_service.get_expired_items(expiry_datetime,
                                                      spiked_events_only=True):
            events.update({item[config.ID_FIELD]: item for item in items})

        for event_id, event in events.items():
            if event.get('recurrence_id'
                         ) and event['recurrence_id'] not in series_to_delete:
                spiked, events = self.is_series_expired_and_spiked(
                    event, expiry_datetime)
                if spiked:
                    series_to_delete[event['recurrence_id']] = events
            else:
                events_service.delete_action(lookup={'_id': event_id})
                events_deleted.add(event_id)

        # Delete recurring series
        for recurrence_id, events in series_to_delete.items():
            events_service.delete_action(
                lookup={'recurrence_id': recurrence_id})
            events_deleted.update(events)  # add the series' event ids, not the container itself

        logger.info('{} {} Events deleted: {}'.format(self.log_msg,
                                                      len(events_deleted),
                                                      list(events_deleted)))
Example #2
def routing(item, desk=None, **kwargs):
    if desk is None:
        desk_id = item.get("task", {}).get("desk")
        if desk_id:
            desk = get_resource_service("desks").find_one(req=None,
                                                          _id=desk_id)
    dest = get_destination_desk(desk)
    if dest and str(desk["_id"]) != str(dest["_id"]):
        logger.info('auto-routing item "%s" from desk "%s" to "%s"',
                    item.get("headline"), desk.get("name"), dest.get("name"))
        try:
            marked_desks = item.get("marked_desks", [])
            existing = [
                mark for mark in marked_desks
                if str(mark["desk_id"]) == str(dest["_id"])
            ]
            if not existing:
                marked_desks.append({
                    "desk_id": str(dest["_id"]),
                    "date_marked": utcnow(),
                })
                item["marked_desks"] = marked_desks
        except Exception:
            logger.exception("auto-routing error")
    return item
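The duplicate check before appending to marked_desks keeps routing idempotent: running it twice never records the same desk twice. A hedged sketch of just that step, with mark_desk as a made-up helper name and utcnow() replaced by the standard library:

from datetime import datetime, timezone

def mark_desk(item, desk_id):
    # Append a desk mark only if no mark with this desk_id exists yet.
    marks = item.setdefault('marked_desks', [])
    if not any(str(m['desk_id']) == str(desk_id) for m in marks):
        marks.append({'desk_id': str(desk_id),
                      'date_marked': datetime.now(timezone.utc)})
    return item

item = mark_desk({}, 'desk-1')
item = mark_desk(item, 'desk-1')  # second call is a no-op
print(len(item['marked_desks']))  # 1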
Example #3
def remove_media_files(doc):
    """Removes the media files of the given doc.

    If the media files are not referenced by any other
    story then delete the media files.
    :param dict doc: document for which the media are being deleted
    :return boolean: True if files are deleted, else False.
    """
    logger.info('Removing Media Files...')
    references = None

    if doc.get('renditions'):
        references = [doc.get('renditions')]

    if not references:
        references = [assoc.get('renditions') for assoc in (doc.get(ASSOCIATIONS) or {}).values()
                      if assoc and assoc.get('renditions')]

    for renditions in references:
        for rendition in renditions.values():
            media = rendition.get('media') if isinstance(rendition.get('media'), str) else str(rendition.get('media'))
            try:
                references = get_resource_service('media_references').get(req=None, lookup={
                    'media_id': media, 'published': True
                })

                if references.count() == 0:
                    logger.info('Deleting media:{}'.format(rendition.get('media')))
                    app.media.delete(media)
            except Exception:
                logger.exception('Failed to remove Media Id: {} from item: {}'.format(media, doc.get(config.ID_FIELD)))
Example #4
def remove_media_files(doc):
    """Removes the media files of the given doc.

    If the media files are not referenced by any other
    story then delete the media files.
    :param dict doc: document for which the media are being deleted
    :return boolean: True if files are deleted, else False.
    """
    logger.info('Removing Media Files...')
    references = None

    if doc.get('renditions'):
        references = [doc.get('renditions')]

    if not references:
        references = [assoc.get('renditions') for assoc in (doc.get(ASSOCIATIONS) or {}).values()
                      if assoc and assoc.get('renditions')]

    for renditions in references:
        for rendition in renditions.values():
            media = rendition.get('media') if isinstance(rendition.get('media'), str) else str(rendition.get('media'))
            try:
                references = get_resource_service('media_references').get(req=None, lookup={
                    'media_id': media, 'published': True
                })

                if references.count() == 0:
                    logger.info('Deleting media:{}'.format(rendition.get('media')))
                    app.media.delete(media)
            except Exception:
                logger.exception('Failed to remove Media Id: {} from item: {}'.format(media, doc.get(config.ID_FIELD)))

    for attachment in doc.get('attachments', []):
        lookup = {'_id': attachment['attachment']}
        get_resource_service('attachments').delete_action(lookup)
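Both variants above delete a media file only when the media_references lookup finds no published reference to it. A small stand-alone sketch of that reference-count guard; count_published_references is a hypothetical callable standing in for the real resource service:

def unreferenced_media(renditions, count_published_references):
    # Yield media ids whose published reference count is zero; only these
    # are safe to delete from storage.
    for rendition in renditions.values():
        media = str(rendition.get('media'))
        if count_published_references(media) == 0:
            yield media

refs = {'m1': 2, 'm2': 0}
renditions = {'original': {'media': 'm1'}, 'thumbnail': {'media': 'm2'}}
print(list(unreferenced_media(renditions, lambda m: refs.get(m, 0))))  # ['m2']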
Example #5
def routing(item, desk=None, **kwargs):
    if desk is None:
        desk_id = item.get('task', {}).get('desk')
        if desk_id:
            desk = get_resource_service('desks').find_one(req=None,
                                                          _id=desk_id)
    dest = get_destination_desk(desk)
    if dest and str(desk['_id']) != str(dest['_id']):
        logger.info('auto-routing item "%s" from desk "%s" to "%s"',
                    item.get('headline'), desk.get('name'), dest.get('name'))
        try:
            marked_desks = item.get('marked_desks', [])
            existing = [
                mark for mark in marked_desks
                if str(mark['desk_id']) == str(dest['_id'])
            ]
            if not existing:
                marked_desks.append({
                    'desk_id': str(dest['_id']),
                    'date_marked': utcnow(),
                })
                item['marked_desks'] = marked_desks
        except Exception:
            logger.exception('auto-routing error')
    return item
Example #6
    def find_one(self, endpoint_name, req, **lookup):
        """Find single item.

        :param endpoint_name: resource name
        :param req: parsed request
        :param lookup: additional filter
        """
        backend = self._backend(endpoint_name)
        item = backend.find_one(endpoint_name, req=req, **lookup)
        search_backend = self._lookup_backend(endpoint_name, fallback=True)
        if search_backend:
            # set the parent for the parent child in elastic search
            self._set_parent(endpoint_name, item, lookup)
            item_search = search_backend.find_one(endpoint_name, req=req, **lookup)
            if item is None and item_search:
                item = item_search
                logger.warn(item_msg('item is only in elastic', item))
            elif item_search is None and item:
                logger.warn(item_msg('item is only in mongo', item))
                try:
                    logger.info(item_msg('trying to add item to elastic', item))
                    search_backend.insert(endpoint_name, [item])
                except RequestError as e:
                    logger.error(item_msg('failed to add item into elastic error={}'.format(str(e)), item))
        return item
Example #7
    def find_one(self, endpoint_name, req, **lookup):
        """Find single item.

        :param endpoint_name: resource name
        :param req: parsed request
        :param lookup: additional filter
        """
        backend = self._backend(endpoint_name)
        item = backend.find_one(endpoint_name, req=req, **lookup)
        search_backend = self._lookup_backend(endpoint_name, fallback=True)
        if search_backend:
            # set the parent for the parent child in elastic search
            self._set_parent(endpoint_name, item, lookup)
            item_search = search_backend.find_one(endpoint_name,
                                                  req=req,
                                                  **lookup)
            if item is None and item_search:
                item = item_search
                logger.warn(item_msg('item is only in elastic', item))
            elif item_search is None and item:
                logger.warn(item_msg('item is only in mongo', item))
                try:
                    logger.info(item_msg('trying to add item to elastic',
                                         item))
                    search_backend.insert(endpoint_name, [item])
                except RequestError as e:
                    logger.error(
                        item_msg(
                            'failed to add item into elastic error={}'.format(
                                str(e)), item))
        return item
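Examples #6 and #7 read from both stores and treat mongo as the source of truth: an item found only in mongo is reindexed into elastic, and an item found only in elastic is returned as a fallback. A generic sketch of that self-healing read, with all four callables passed in as assumptions rather than real Superdesk APIs:

def find_one_with_repair(primary_find, search_find, search_insert, log):
    # Prefer the primary store; backfill the search index when the document
    # is missing there, and fall back to the search copy otherwise.
    item = primary_find()
    item_search = search_find()
    if item is None and item_search is not None:
        log('item is only in the search backend')
        return item_search
    if item is not None and item_search is None:
        log('item is only in the primary store; reindexing')
        search_insert(item)
    return item

doc = {'_id': 1}
print(find_one_with_repair(lambda: doc, lambda: None,
                           lambda item: None, print))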
Example #8
    def delete(self, endpoint_name, lookup):
        """Delete method to delete by using mongo query syntax.

        :param endpoint_name: Name of the endpoint
        :param lookup: User mongo query syntax. example 1. ``{'_id':123}``, 2. ``{'item_id': {'$in': [123, 234]}}``
        :returns: Returns the mongo remove command response. {'n': 12, 'ok': 1}
        """
        backend = self._backend(endpoint_name)
        search_backend = self._lookup_backend(endpoint_name)
        docs = self.get_from_mongo(endpoint_name, lookup=lookup, req=ParsedRequest())
        ids = [doc[config.ID_FIELD] for doc in docs]
        removed_ids = ids
        logger.info("total documents to be removed {}".format(len(ids)))
        if search_backend and ids:
            removed_ids = []
            # first remove it from search backend, so it won't show up. when this is done - remove it from mongo
            for _id in ids:
                try:
                    self.remove_from_search(endpoint_name, _id)
                    removed_ids.append(_id)
                except NotFoundError:
                    logger.warning('item missing from elastic _id=%s' % (_id, ))
                    removed_ids.append(_id)
                except Exception:
                    logger.exception('item can not be removed from elastic _id=%s' % (_id, ))
        backend.remove(endpoint_name, {config.ID_FIELD: {'$in': removed_ids}})
        logger.info("Removed {} documents from {}.".format(len(ids), endpoint_name))
        if not ids:
            logger.warn("No documents for {} resource were deleted using lookup {}".format(endpoint_name, lookup))
Example #9
    def delete_docs(self, endpoint_name, docs):
        """Delete using list of documents."""
        backend = self._backend(endpoint_name)
        search_backend = self._lookup_backend(endpoint_name)
        ids = [doc[config.ID_FIELD] for doc in docs]
        removed_ids = ids
        logger.info("total documents to be removed {}".format(len(ids)))
        if search_backend and ids:
            removed_ids = []
            # first remove it from search backend, so it won't show up. when this is done - remove it from mongo
            for doc in docs:
                try:
                    self.remove_from_search(endpoint_name, doc)
                    removed_ids.append(doc[config.ID_FIELD])
                except NotFoundError:
                    logger.warning('item missing from elastic _id=%s' %
                                   (doc[config.ID_FIELD], ))
                    removed_ids.append(doc[config.ID_FIELD])
                except Exception:
                    logger.exception(
                        'item can not be removed from elastic _id=%s' %
                        (doc[config.ID_FIELD], ))
        if len(removed_ids):
            backend.remove(endpoint_name,
                           {config.ID_FIELD: {
                               '$in': removed_ids
                           }})
            logger.info("Removed %d documents from %s.", len(removed_ids),
                        endpoint_name)
        else:
            logger.warn("No documents for %s resource were deleted.",
                        endpoint_name)
        return removed_ids
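Examples #8 and #9 share one ordering rule: remove documents from the search index first so they stop showing up, then delete the same ids from mongo, tolerating ids that elastic no longer knows about. A hedged, dependency-free sketch of that two-phase delete (KeyError stands in for elastic's NotFoundError):

def two_phase_delete(ids, remove_from_search, remove_from_primary, log):
    removed = []
    for _id in ids:
        try:
            remove_from_search(_id)
            removed.append(_id)
        except KeyError:  # stand-in for NotFoundError: already gone
            log('item missing from search _id=%s' % _id)
            removed.append(_id)
        except Exception:
            log('item can not be removed from search _id=%s' % _id)
    if removed:
        remove_from_primary(removed)  # e.g. {'_id': {'$in': removed}}
    return removed

index = {'a': 1}
print(two_phase_delete(['a', 'b'], index.pop, lambda ids: None, print))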
Example #10
def ping_scanpix(assoc, item):
    for key in ['OWNER', 'USERNAME', 'PASSWORD']:
        if not app.config.get('SCANPIX_PING_%s' % key):
            return
    try:
        res = http.post(
            SCANPIX_PING_URL,
            json.dumps({
                'type': 'articleUsage',
                'data': {
                    'owner':
                    app.config['SCANPIX_PING_OWNER'],
                    'media_id':
                    assoc.get('guid', assoc.get('_id')),
                    'article_id':
                    item.get('guid', item.get('_id')),
                    'services':
                    [cat.get('name') for cat in item.get('anpa_category', [])],
                },
            }),
            headers={'content-type': 'application/json'},
            auth=(app.config['SCANPIX_PING_USERNAME'],
                  app.config['SCANPIX_PING_PASSWORD']),
            timeout=PING_TIMEOUT,
        )
        logger.info('scanpix image published status=%d image=%s article=%s',
                    res.status_code, assoc.get('guid',
                                               ''), item.get('guid', ''))
    except Exception as e:
        logger.exception(e)
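ping_scanpix treats the notification as fire-and-forget: missing configuration makes it return early, and any transport failure is logged rather than raised, so publishing never fails because of the ping. A reduced sketch of that pattern using plain requests (the URL, payload, and auth are placeholders):

import json
import requests

def ping(url, payload, auth, timeout=5):
    # Best-effort notification: never let a ping failure propagate.
    try:
        res = requests.post(url, json.dumps(payload),
                            headers={'content-type': 'application/json'},
                            auth=auth, timeout=timeout)
        print('ping status=%d' % res.status_code)
    except Exception as exc:
        print('ping failed: %s' % exc)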
Example #12
def remove_locks():
    """
    Removes item related locks that are not in use
    :return:
    """
    result = _lock.collection.delete_many({'$or': [{'_id': re.compile('^item_move'), 'locked': False},
                                          {'_id': re.compile('^item_lock'), 'locked': False}]})
    logger.info('unused item locks deleted count={}'.format(result.deleted_count))
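The delete_many filter above matches lock ids by prefix with a compiled regex, which pymongo passes through as a $regex query. A hedged sketch of the same call against an arbitrary pymongo collection:

import re
from pymongo import MongoClient

def remove_unused_locks(collection):
    # Delete every unlocked lock document whose _id carries either prefix.
    result = collection.delete_many({'$or': [
        {'_id': re.compile('^item_move'), 'locked': False},
        {'_id': re.compile('^item_lock'), 'locked': False},
    ]})
    return result.deleted_count

# Usage (assumes a local mongod and a 'locks' collection):
# locks = MongoClient().superdesk.locks
# print(remove_unused_locks(locks))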
Example #14
def init_app(app):
    item_publish.connect(publish_scanpix)
    if app.config.get('SCANPIX_PING_OWNER') and app.config.get(
            'SCANPIX_PING_USERNAME'):
        logger.info('SCANPIX ping owner configured %s',
                    app.config['SCANPIX_PING_OWNER'])
    else:
        logger.info('SCANPIX ping owner not set')
Example #15
    def transmit(self, queue_item):
        try:
            self._transmit(queue_item, None)
            logger.info('Successfully transmitted item {}'.format(
                queue_item.get('item_id')))
        except Exception:
            logger.exception("Failed to transmit the item {}.".format(
                queue_item.get('item_id')))
Example #16
    def _update(self, provider, update):
        updated = utcnow()

        last_updated = provider.get('last_updated')
        ttl_minutes = app.config['INGEST_EXPIRY_MINUTES']
        if not last_updated or last_updated < updated - datetime.timedelta(
                minutes=ttl_minutes):
            last_updated = updated - datetime.timedelta(minutes=ttl_minutes)

        self.provider = provider
        provider_config = provider.get('config')
        if not provider_config:
            provider_config = {}
            provider['config'] = provider_config

        self.URL = provider_config.get('url')
        payload = {}

        parser = self.get_feed_parser(provider)

        try:
            response = requests.get(self.URL, params=payload, timeout=15)
            # TODO: check if file has been updated since provider last_updated
            # although some providers do not include 'Last-Modified' in headers
            # so unsure how to do this
            logger.info('Http Headers: %s', response.headers)
        except requests.exceptions.Timeout as ex:
            # Maybe set up for a retry, or continue in a retry loop
            raise IngestApiError.apiTimeoutError(ex, self.provider)
        except requests.exceptions.TooManyRedirects as ex:
            # Tell the user their URL was bad and try a different one
            raise IngestApiError.apiRedirectError(ex, self.provider)
        except requests.exceptions.RequestException as ex:
            # catastrophic error. bail.
            raise IngestApiError.apiRequestError(ex, self.provider)
        except Exception as error:
            traceback.print_exc()
            raise IngestApiError.apiGeneralError(error, self.provider)

        if response.status_code == 404:
            raise LookupError('Not found %s' % payload)

        logger.info('Ingesting: %s', str(response.content))

        if isinstance(parser, NTBEventXMLFeedParser):
            xml = ET.fromstring(response.content)
            items = parser.parse(xml, provider)
        elif isinstance(parser, IcsTwoFeedParser):
            cal = Calendar.from_ical(response.content)
            items = parser.parse(cal, provider)
        else:
            items = parser.parse(response.content)

        if isinstance(items, list):
            yield items
        else:
            yield [items]
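_update maps each transport failure onto a specific IngestApiError so callers deal with one exception family. A self-contained sketch of that error-mapping shape, with a stand-in FetchError instead of the real hierarchy:

import requests

class FetchError(Exception):
    """Stand-in for IngestApiError; wraps the original exception."""

def fetch(url, timeout=15):
    # Translate every transport failure into the one domain error.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.Timeout as ex:
        raise FetchError('timeout') from ex
    except requests.exceptions.TooManyRedirects as ex:
        raise FetchError('redirect loop') from ex
    except requests.exceptions.RequestException as ex:
        raise FetchError('request failed') from ex
    if response.status_code == 404:
        raise LookupError('Not found %s' % url)
    return response.content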
Example #17
def unlock(task, host):
    """Release lock on given task.

    Lock can be only released by host which locked it.

    :param task: task name
    :param host: current host id
    """
    logger.info('releasing lock task=%s host=%s' % (task, host))
    return _lock.release(task, host)
Example #18
    def get_ids(self, channel, last_updated, updated):
        """Get ids of documents which should be updated."""
        ids = set()
        payload = {'channel': channel, 'fieldsRef': 'id'}
        payload['dateRange'] = "%s-%s" % (self.format_date(last_updated), self.format_date(updated))
        logger.info('Reuters requesting Date Range |{}| for channel {}'.format(payload['dateRange'], channel))
        tree = self.get_tree('items', payload)
        for result in tree.findall('result'):
            ids.add(result.find('guid').text)
        return ids
Example #19
    def _update(self, provider, update):
        self.provider = provider
        parser = self.get_feed_parser(provider)

        # get the current year, it is used to filter fixtures for this year and next
        year = int(utcnow().year) % 100
        config = provider.get('config', {})
        content = self._request(
            config.get('login_url').format(config.get('username'),
                                           config.get('password')))
        # get the configured sports
        configured_sports = config.get('sports').split(',')
        xml = ET.fromstring(content)
        if xml.attrib['Status_Code'] == 'OK':
            session = xml.attrib['Status_Session']
            content = self._request(
                config.get('fixtures_url').format(session, '', '', ''))
            xml = ET.fromstring(content)
            for s in xml.findall('.//Sports/Sport'):
                sport_id = s.attrib['SportID']
                if sport_id not in configured_sports:
                    continue
                sport_name = s.attrib['SportName']
                content = self._request(
                    config.get('fixtures_url').format(session, sport_id, '',
                                                      ''))
                sport_xml = ET.fromstring(content)
                for c in sport_xml.findall('.//Competition'):
                    comp_id = c.attrib.get('Comp_ID')
                    comp_name = c.attrib.get('Comp_Name')
                    content = self._request(
                        config.get('fixtures_url').format(
                            session, sport_id, comp_id, ''))
                    comp_xml = ET.fromstring(content)
                    for season in comp_xml.findall('.//Season'):
                        season_id = season.attrib.get('SeasonID')
                        if str(year) in season_id or str(year +
                                                         1) in season_id:
                            content = self._request(
                                config.get('fixtures_url').format(
                                    session, sport_id, comp_id, season_id))
                            fixture_xml = ET.fromstring(content)
                            logger.info('Parsing {}/{} {}/{}'.format(
                                sport_id, sport_name, comp_id, comp_name))
                            items = parser.parse(
                                {
                                    'fixture_xml': fixture_xml,
                                    'sport_id': sport_id,
                                    'sport_name': sport_name,
                                    'comp_name': comp_name,
                                    'comp_id': comp_id
                                }, provider)
                            if len(items) > 0:
                                yield items
Example #20
    def run(self):
        now = utcnow()
        self.log_msg = 'Delete Spiked Items Time: {}.'.format(now)
        logger.info('{} Starting to delete spiked items.'.format(
            self.log_msg))

        expire_interval = app.config.get('PLANNING_DELETE_SPIKED_MINUTES', 0)
        if expire_interval == 0:
            logger.info(
                '{} PLANNING_DELETE_SPIKED_MINUTES=0, not deleting any '
                'items'.format(self.log_msg))
            return

        lock_name = get_lock_id('planning', 'delete_spiked')
        if not lock(lock_name, expire=610):
            logger.info(
                '{} Delete spiked items task is already running'.format(
                    self.log_msg))
            return

        expiry_datetime = now - timedelta(minutes=expire_interval)

        try:
            self._delete_spiked_events(expiry_datetime)
        except Exception as e:
            logger.exception(e)

        try:
            self._delete_spiked_planning(expiry_datetime)
        except Exception as e:
            logger.exception(e)

        unlock(lock_name)

        logger.info('{} Completed deleting spiked items.'.format(self.log_msg))
        remove_locks()
Example #21
def remove_expired_data(provider):
    """Remove expired data for provider"""
    logger.info('Removing expired content for provider: %s' %
                provider.get('_id', 'Detached items'))
    ingest_service = superdesk.get_resource_service('ingest')

    items = get_expired_items(provider)

    ids = [item['_id'] for item in items]
    items.rewind()
    file_ids = [
        rend.get('media') for item in items
        for rend in item.get('renditions', {}).values()
        if not item.get('archived') and rend.get('media')
    ]

    if ids:
        logger.info('Removing items %s' % ids)
        ingest_service.delete({'_id': {'$in': ids}})
        push_expired_notification(ids)

    for file_id in file_ids:
        logger.info('Deleting file: %s' % file_id)
        superdesk.app.media.delete(file_id)

    stats.incr('ingest.expired_items', len(ids))
    logger.info('Removed expired content for provider: {0} count: {1}'.format(
        provider.get('_id', 'Detached items'), len(ids)))

    remove_expired_from_elastic()
Example #22
def lock(task, host, expire=300, timeout=None):
    """Try to lock task.

    :param task: task name
    :param host: current host id
    :param expire: lock ttl in seconds
    :param timeout: how long should it wait if task is locked
    """
    got_lock = _lock.lock(task, host, expire=expire, timeout=timeout)
    if got_lock:
        logger.info('got lock task=%s host=%s' % (task, host))
    else:
        logger.info('task locked already task=%s host=%s' % (task, host))
    return got_lock
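lock() and unlock() from examples #17 and #22 pair naturally with try/finally so a crashed task still releases its lock before the TTL expires. A hedged usage sketch, assuming both functions are in scope; the task and host names are made up:

def run_exclusive(task, host, work, expire=300):
    # Skip silently when another host holds the lock; otherwise make sure
    # the lock is released even if the work raises.
    if not lock(task, host, expire=expire):
        return False
    try:
        work()
    finally:
        unlock(task, host)
    return True

# run_exclusive('ingest:update', 'host-1', lambda: None)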
Example #23
    def _remove_documents_from_search_backend(self, endpoint_name, ids):
        """Remove documents from the search backend.

        :param endpoint_name: name of the endpoint
        :param ids: list of ids
        """
        ids = [str(doc_id) for doc_id in ids]
        batch_size = 500
        logger.info("total documents to be removed {}".format(len(ids)))
        for i in range(0, len(ids), batch_size):
            batch = ids[i:i + batch_size]
            query = {'query': {'terms': {'{}._id'.format(endpoint_name): batch}}}
            app.data._search_backend(endpoint_name).remove(endpoint_name, query)
            logger.info("Removed {} documents from {}.".format(len(batch), endpoint_name))
Example #24
def remove_expired_data(provider):
    """Remove expired data for provider"""
    logger.info('Removing expired content for provider: %s' % provider.get('_id', 'Detached items'))

    try:
        feeding_service = registered_feeding_services[provider['feeding_service']]
        feeding_service = feeding_service.__class__()
        ingest_collection = feeding_service.service if hasattr(feeding_service, 'service') else 'ingest'
    except KeyError:
        ingest_collection = 'ingest'

    ingest_service = superdesk.get_resource_service(ingest_collection)

    items = get_expired_items(provider, ingest_collection)

    ids = [item['_id'] for item in items]
    items.rewind()
    file_ids = [rend.get('media')
                for item in items
                for rend in item.get('renditions', {}).values()
                if not item.get('archived') and rend.get('media')]

    if ids:
        logger.info('Removing items %s' % ids)
        ingest_service.delete({'_id': {'$in': ids}})
        push_expired_notification(ids)

    for file_id in file_ids:
        logger.info('Deleting file: %s' % file_id)
        superdesk.app.media.delete(file_id)

    logger.info('Removed expired content for provider: {0} count: {1}'
                .format(provider.get('_id', 'Detached items'), len(ids)))

    remove_expired_from_elastic(ingest_collection)
Example #25
def remove_expired_data(provider):
    """Remove expired data for provider"""
    logger.info('Removing expired content for provider: %s' % provider.get('_id', 'Detached items'))
    ingest_service = superdesk.get_resource_service('ingest')

    items = get_expired_items(provider)

    ids = [item['_id'] for item in items]
    items.rewind()
    file_ids = [rend.get('media')
                for item in items
                for rend in item.get('renditions', {}).values()
                if not item.get('archived') and rend.get('media')]

    if ids:
        logger.info('Removing items %s' % ids)
        ingest_service.delete({'_id': {'$in': ids}})
        push_expired_notification(ids)

    for file_id in file_ids:
        logger.info('Deleting file: %s' % file_id)
        superdesk.app.media.delete(file_id)

    stats.incr('ingest.expired_items', len(ids))
    logger.info('Removed expired content for provider: {0} count: {1}'
                .format(provider.get('_id', 'Detached items'), len(ids)))

    remove_expired_from_elastic()
Example #27
    def _get_article_ids(self, channel, last_updated, updated):
        """
        Get article ids which should be upserted.
        """

        ids = set()
        payload = {'channel': channel, 'fieldsRef': 'id',
                   'dateRange': "%s-%s" % (self._format_date(last_updated), self._format_date(updated))}

        logger.info('Reuters requesting Date Range |{}| for channel {}'.format(payload['dateRange'], channel))
        tree = self._get_tree('items', payload)
        for result in tree.findall('result'):
            ids.add(result.find('guid').text)

        return ids
Example #28
    def _remove_expired_published_planning():
        """Expire planning versions

        Expiry of the planning versions mirrors the expiry of items within the publish queue in Superdesk, so it uses
        the same configuration value.

        :param self:
        :return:
        """
        expire_interval = app.config.get('PUBLISH_QUEUE_EXPIRY_MINUTES', 0)
        if expire_interval:
            expire_time = utcnow() - timedelta(minutes=expire_interval)
            logger.info('Removing planning history items created before {}'.format(str(expire_time)))

            get_resource_service('published_planning').delete({'_id': {'$lte': ObjectId.from_datetime(expire_time)}})
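The delete above works because mongo ObjectIds embed their creation timestamp, so a synthetic ObjectId built from the cut-off time bounds every older document. A hedged sketch of building that lookup with bson; the 60-minute window is illustrative:

from datetime import datetime, timedelta, timezone
from bson import ObjectId

# ObjectId.from_datetime() builds an id whose embedded timestamp is the
# cut-off; '$lte' against _id then matches everything created before it.
expire_time = datetime.now(timezone.utc) - timedelta(minutes=60)
lookup = {'_id': {'$lte': ObjectId.from_datetime(expire_time)}}
print(lookup)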
Example #29
    def _flag_expired_events(self, expiry_datetime):
        logger.info('{} Starting to flag expired events'.format(self.log_msg))
        events_service = get_resource_service('events')
        planning_service = get_resource_service('planning')

        locked_events = set()
        events_in_use = set()
        events_expired = set()
        plans_expired = set()

        # Obtain the full list of Events that we're to process first
        # As subsequent queries will change the list of returned items
        events = dict()
        for items in events_service.get_expired_items(expiry_datetime):
            events.update({item[config.ID_FIELD]: item for item in items})

        self._set_event_plans(events)

        for event_id, event in events.items():
            if event.get('lock_user'):
                locked_events.add(event_id)
            elif self._get_event_schedule(event) > expiry_datetime:
                events_in_use.add(event_id)
            else:
                events_expired.add(event_id)
                events_service.system_update(event_id, {'expired': True}, event)
                for plan in event.get('_plans', []):
                    plan_id = plan[config.ID_FIELD]
                    planning_service.system_update(plan_id, {'expired': True}, plan)
                    plans_expired.add(plan_id)

        if len(locked_events) > 0:
            logger.info('{} Skipping {} locked Events: {}'.format(
                self.log_msg,
                len(locked_events),
                list(locked_events)
            ))

        if len(events_in_use) > 0:
            logger.info('{} Skipping {} Events in use: {}'.format(
                self.log_msg,
                len(events_in_use),
                list(events_in_use)
            ))

        if len(events_expired) > 0:
            push_notification(
                'events:expired',
                items=list(events_expired)
            )

        if len(plans_expired) > 0:
            push_notification(
                'planning:expired',
                items=list(plans_expired)
            )

        logger.info('{} {} Events expired: {}'.format(self.log_msg, len(events_expired), list(events_expired)))
Example #30
    def run(self, now=None):
        if now:
            now_utc = now if isinstance(now, datetime) else local_to_utc(
                app.config['DEFAULT_TIMEZONE'],
                datetime.strptime(now, '%Y-%m-%dT%H'))
        else:
            now_utc = utcnow()

        now_local = utc_to_local(app.config['DEFAULT_TIMEZONE'], now_utc)

        logger.info('Starting to send scheduled reports: {}'.format(now_utc))

        schedules = self.get_schedules()

        if len(schedules) < 1:
            logger.info('No enabled schedules found, not continuing')
            return

        # Set now to the beginning of the hour (in local time)
        now_local = now_local.replace(minute=0, second=0, microsecond=0)

        for scheduled_report in schedules:
            schedule_id = str(scheduled_report.get('_id'))

            try:
                if not self.should_send_report(scheduled_report, now_local):
                    logger.info(
                        'Scheduled Report {} not scheduled to be sent'.format(
                            schedule_id))
                    continue

                logger.info('Attempting to send Scheduled Report {}'.format(
                    schedule_id))
                self._send_report(scheduled_report)

                # Update the _last_sent of the schedule
                get_resource_service('scheduled_reports').system_update(
                    scheduled_report.get('_id'), {'_last_sent': now_utc},
                    scheduled_report)
            except Exception as e:
                logger.error(
                    'Failed to generate report for {}. Error: {}'.format(
                        schedule_id, str(e)))
                logger.exception(e)

        logger.info('Completed sending scheduled reports: {}'.format(now_utc))
Example #31
    def find_one(self, endpoint_name, req, **lookup):
        backend = self._backend(endpoint_name)
        item = backend.find_one(endpoint_name, req=req, **lookup)
        search_backend = self._lookup_backend(endpoint_name, fallback=True)
        if search_backend:
            item_search = search_backend.find_one(endpoint_name, req=req, **lookup)
            if item is None and item_search:
                item = item_search
                logger.warn(item_msg('item is only in elastic', item))
            elif item_search is None and item:
                logger.warn(item_msg('item is only in mongo', item))
                try:
                    logger.info(item_msg('trying to add item to elastic', item))
                    search_backend.insert(endpoint_name, [item])
                except RequestError as e:
                    logger.error(item_msg('failed to add item into elastic error={}'.format(str(e)), item))
        return item
Example #33
def remove_media_files(doc):
    """Removes the media files of the given doc.

    If the media files are not referenced by any other
    story then delete the media files.
    :param dict doc: document for which the media are being deleted
    :return boolean: True if files are deleted, else False.
    """
    references = None

    if doc.get("renditions"):
        references = [doc.get("renditions")]

    if not references:
        references = [
            assoc.get("renditions")
            for assoc in (doc.get(ASSOCIATIONS) or {}).values()
            if assoc and assoc.get("renditions")
        ]

    if references:
        logger.info("Removing media files for %s", doc.get("guid"))

    for renditions in references:
        for rendition in renditions.values():
            media = rendition.get("media") if isinstance(
                rendition.get("media"), str) else str(rendition.get("media"))
            try:
                references = get_resource_service("media_references").get(
                    req=None, lookup={
                        "media_id": media,
                        "published": True
                    })

                if references.count() == 0:
                    logger.info("Deleting media:{}".format(
                        rendition.get("media")))
                    app.media.delete(media)
            except Exception:
                logger.exception(
                    "Failed to remove Media Id: {} from item: {}".format(
                        media, doc.get(config.ID_FIELD)))

    for attachment in doc.get("attachments", []):
        lookup = {"_id": attachment["attachment"]}
        get_resource_service("attachments").delete_action(lookup)
Example #34
def remove_locks():
    """
    Removes item related locks that are not in use
    :return:
    """
    result = _lock.collection.delete_many({
        "$or": [
            {
                "_id": re.compile("^item_move"),
                "locked": False
            },
            {
                "_id": re.compile("^item_lock"),
                "locked": False
            },
        ]
    })
    logger.info("unused item locks deleted count={}".format(
        result.deleted_count))
Example #35
    def _export_events(self):
        """Export events"""
        logger.info('Starting to export events')
        events_service = get_resource_service('events')

        formatter = JsonEventFormatter()
        destination = self._get_destination('json_event')
        formatter.set_destination(destination=destination,
                                  subscriber=self.subscriber)
        transmitter = NewsroomHTTPTransmitter()
        for items in self._fetch_items(events_service.get):
            for item in items:
                try:
                    logger.info('Processing event item: {}'.format(
                        item.get('_id')))
                    version, event = get_version_item_for_post(item)
                    queue_item = self._get_queue_item(event,
                                                      formatter._format_item,
                                                      destination)
                    transmitter.transmit(queue_item)
                    logger.info('Processed event item: {}'.format(
                        item.get('_id')))
                except Exception:
                    logger.exception('Failed to export event: {}'.format(
                        item.get('_id')))
Example #36
    def _flag_expired_planning(self, expiry_datetime):
        logger.info('{} Starting to flag expired planning items'.format(
            self.log_msg))
        planning_service = get_resource_service('planning')

        # Obtain the full list of Planning items that we're to process first
        # As subsequent queries will change the list of returned items
        plans = dict()
        for items in planning_service.get_expired_items(expiry_datetime):
            plans.update({item[config.ID_FIELD]: item for item in items})

        locked_plans = set()
        plans_expired = set()

        for plan_id, plan in plans.items():
            if plan.get('lock_user'):
                locked_plans.add(plan_id)
            else:
                planning_service.system_update(plan[config.ID_FIELD],
                                               {'expired': True}, plan)
                plans_expired.add(plan_id)

        if len(locked_plans) > 0:
            logger.info('{} Skipping {} locked Planning items: {}'.format(
                self.log_msg, len(locked_plans), list(locked_plans)))

        if len(plans_expired) > 0:
            push_notification('planning:expired', items=list(plans_expired))

        logger.info('{} {} Planning items expired: {}'.format(
            self.log_msg, len(plans_expired), list(plans_expired)))
Example #37
    def _export_planning(self):
        """Export events"""
        logger.info('Starting to export planning')
        planning_service = get_resource_service('planning')

        formatter = JsonPlanningFormatter()
        destination = self._get_destination('json_planning')
        formatter.set_destination(destination=destination,
                                  subscriber=self.subscriber)
        transmitter = NewsroomHTTPTransmitter()
        for items in self._fetch_items(planning_service.get):
            for item in items:
                try:
                    logger.info('Processing planning item: {}'.format(
                        item.get('_id')))
                    version, plan = get_version_item_for_post(item)
                    queue_item = self._get_queue_item(plan,
                                                      formatter._format_item,
                                                      destination)
                    transmitter.transmit(queue_item)
                    logger.info('Processed planning item: {}'.format(
                        item.get('item_id')))
                except Exception:
                    logger.exception(
                        'Failed to export planning item: {}'.format(
                            item.get('_id')))
Example #38
    def _update(self, provider, update):
        self.provider = provider
        parser = self.get_feed_parser(provider)

        # get the current year, it is used to filter fixtures for this year and next
        year = int(utcnow().year) % 100
        config = provider.get('config', {})
        content = self._request(config.get('login_url').format(config.get('username'), config.get('password')))
        # get the configured sports
        configured_sports = config.get('sports').split(',')
        xml = ET.fromstring(content)
        if xml.attrib['Status_Code'] == 'OK':
            session = xml.attrib['Status_Session']
            content = self._request(config.get('fixtures_url').format(session, '', '', ''))
            xml = ET.fromstring(content)
            for s in xml.findall('.//Sports/Sport'):
                sport_id = s.attrib['SportID']
                if sport_id not in configured_sports:
                    continue
                sport_name = s.attrib['SportName']
                content = self._request(config.get('fixtures_url').format(session, sport_id, '', ''))
                sport_xml = ET.fromstring(content)
                for c in sport_xml.findall('.//Competition'):
                    comp_id = c.attrib.get('Comp_ID')
                    comp_name = c.attrib.get('Comp_Name')
                    content = self._request(config.get('fixtures_url').format(session, sport_id, comp_id, ''))
                    comp_xml = ET.fromstring(content)
                    for season in comp_xml.findall('.//Season'):
                        season_id = season.attrib.get('SeasonID')
                        if str(year) in season_id or str(year + 1) in season_id:
                            content = self._request(
                                config.get('fixtures_url').format(session, sport_id, comp_id, season_id))
                            fixture_xml = ET.fromstring(content)
                            logger.info('Parsing {}/{} {}/{}'.format(sport_id, sport_name, comp_id, comp_name))
                            items = parser.parse({'fixture_xml': fixture_xml, 'sport_id': sport_id,
                                                  'sport_name': sport_name, 'comp_name': comp_name, 'comp_id': comp_id},
                                                 provider)
                            if len(items) > 0:
                                yield items
Example #39
    def generate_stats(self, item_id, gte, chunk_size):
        items_processed = 0
        failed_ids = []
        num_history_items = 0

        statistics_service = get_resource_service('archive_statistics')

        # Get the system record from the last run
        # This document stores the id of the last processed archive_history item
        last_history = statistics_service.get_last_run()
        last_entry_id = last_history.get('guid') or None

        if last_history.get('guid'):
            logger.info('Found previous run, continuing from history item {}'.format(
                last_history['guid']
            ))

        iterated_started = utcnow()
        for history_items in statistics_service.get_history_items(
                last_entry_id,
                gte,
                item_id,
                chunk_size
        ):
            if len(history_items) < 1:
                logger.info('No more history records to process')
                break

            num_history_items += len(history_items)
            last_entry_id = history_items[-1].get(config.ID_FIELD)

            items = self.gen_history_timelines(history_items)
            items_processed += len(items)
            self.process_timelines(items, failed_ids)

            time_diff = (utcnow() - iterated_started).total_seconds()
            logger.info('Processed {}/{} history/item records ({}/{} total) in {} seconds'.format(
                len(history_items),
                len(items),
                num_history_items,
                items_processed,
                int(time_diff)
            ))

            iterated_started = utcnow()

        # Don't store the last processed id if we're generating stats for a single item
        if not item_id:
            # Create/Update the system record from this run
            # Storing the id of the last processed archive_history item
            statistics_service.set_last_run_id(last_entry_id, last_history)

        return items_processed, failed_ids, num_history_items
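generate_stats is resumable: it records the id of the last processed history item so the next run continues where this one stopped. A stripped-down, runnable sketch of that cursor-style loop; fetch_chunk and handle are hypothetical callables:

def process_in_chunks(fetch_chunk, handle, last_id=None):
    # fetch_chunk(last_id) returns the next batch after last_id, or [].
    processed = 0
    while True:
        chunk = fetch_chunk(last_id)
        if not chunk:
            break
        handle(chunk)
        processed += len(chunk)
        last_id = chunk[-1]['_id']  # persist this id to resume later runs
    return processed, last_id

data = [{'_id': i} for i in range(5)]
fetch = lambda last: [d for d in data if last is None or d['_id'] > last][:2]
print(process_in_chunks(fetch, lambda chunk: None))  # (5, 4)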
Example #40
    def _delete_marked_assignments(self):
        logger.info('{} Starting to delete marked assignments'.format(self.log_msg))
        assignments_service = get_resource_service('assignments')

        query = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': {
                                'term': {'_to_delete': True}
                            },
                        }
                    }
                }
            }
        }
        req = ParsedRequest()
        req.args = {'source': json.dumps(query)}
        assignments_to_delete = assignments_service.get(req=req, lookup=None)
        failed_assignments = []
        assignments_deleted = []

        for assignment in assignments_to_delete:
            assign_id = assignment.get(config.ID_FIELD)
            try:
                assignments_service.delete_action(lookup={'_id': assign_id})
                assignments_deleted.append(
                    {
                        'id': assign_id,
                        'slugline': assignment.get('planning', {}).get('slugline'),
                        'type': assignment.get('planning', {}).get('g2_content_type')
                    }
                )
            except SuperdeskApiError as e:
                logger.exception(e)
                failed_assignments.append(assign_id)

        logger.info('{} {} Assignments deleted: {}'.format(self.log_msg,
                                                           len(assignments_deleted),
                                                           str(assignments_deleted)))

        if len(assignments_deleted) > 0:
            push_notification(
                'assignments:delete',
                items=assignments_deleted
            )

        if len(failed_assignments) > 0:
            logger.info(
                '{} {} assignments failed deletion: {}'.format(self.log_msg,
                                                               len(failed_assignments),
                                                               str(failed_assignments)))
Example #41
    def _backend(self):
        if not app:
            raise RuntimeError('You can only use cache within app context.')

        if not app.cache:
            cache_url = app.config.get('CACHE_URL', '')
            if 'redis' in cache_url or 'unix' in cache_url:
                app.cache = SuperdeskRedisBackend(self.mangler, url=cache_url)
                logger.info('using redis cache backend')
            elif cache_url:
                import hermes.backend.memcached
                app.cache = hermes.backend.memcached.Backend(self.mangler, servers=[cache_url])
                logger.info('using memcached cache backend')
            else:
                app.cache = hermes.backend.dict.Backend(self.mangler)
                logger.info('using dict cache backend')

        return app.cache
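_backend picks the cache implementation from CACHE_URL at first use and memoizes it on the app. The dispatch itself is a simple substring check; a runnable sketch of just that choice, returning labels instead of real backend instances:

def pick_backend(cache_url):
    # Mirror the URL-based dispatch above.
    if 'redis' in cache_url or 'unix' in cache_url:
        return 'redis'
    if cache_url:
        return 'memcached'
    return 'dict'

assert pick_backend('redis://localhost:6379/1') == 'redis'
assert pick_backend('127.0.0.1:11211') == 'memcached'
assert pick_backend('') == 'dict'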
Example #42
    def _get_article_ids(self, channel, last_updated, updated):
        """
        Get article ids which should be upserted; also save the poll token that is returned.
        """
        ids = set()
        payload = {'channel': channel, 'fieldsRef': 'id'}

        # check if the channel has a pollToken if not fall back to dateRange
        last_poll_token = self._get_poll_token(channel)
        if last_poll_token is not None:
            logger.info("Reuters requesting channel {} with poll token {}".format(channel, last_poll_token))
            payload['pollToken'] = last_poll_token
        else:
            payload['dateRange'] = "%s-%s" % (self._format_date(last_updated), self._format_date(updated))
            logger.info("Reuters requesting channel {} with dateRange {}".format(channel, payload['dateRange']))

        tree = self._get_tree('items', payload)
        status_code = tree.find('status').get('code') if tree.tag == 'results' else tree.get('code')
        # check the returned status
        if status_code != '10':
            logger.warn("Reuters channel request returned status code {}".format(status_code))
            # status code 30 indicates failure
            if status_code == '30':
                # invalid token
                logger.warn("Reuters error on channel {} code {} {}".format(channel, tree.find('error').get('code'),
                                                                            tree.find('error').text))
                if tree.find('error').get('code') == '2100':
                    self._save_poll_token(channel, None)
                    logger.warn("Reuters channel invalid token reseting {}".format(status_code))
                return ids

        # extract the returned poll token if there is one
        poll_token = tree.find('pollToken')
        if poll_token is not None:
            # a new token indicated new content
            if poll_token.text != last_poll_token:
                logger.info("Reuters channel {} new token {}".format(channel, poll_token.text))
                self._save_poll_token(channel, poll_token.text)
            else:
                # the token has not changed, so nothing new
                logger.info("Reuters channel {} nothing new".format(channel))
                return ids
        else:
            logger.info("Reuters channel {} retrieved no token".format(channel))
            return ids

        for result in tree.findall('result'):
            id = result.find('id').text
            ids.add(id)
            logger.info("Reuters id : {}".format(id))

        return ids
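The method above prefers the stored poll token over a date range and only fetches ids when the service hands back a new token. A hedged sketch of that handshake with the date-range fallback omitted; `request` is a hypothetical callable returning (new_token, ids):

def poll_channel(channel, tokens, request):
    payload = {'channel': channel, 'fieldsRef': 'id'}
    if tokens.get(channel):
        payload['pollToken'] = tokens[channel]
    new_token, ids = request(payload)
    if not new_token or new_token == tokens.get(channel):
        return set()  # no token or an unchanged token: nothing new
    tokens[channel] = new_token  # remember the token for the next poll
    return ids

tokens = {}
print(poll_channel('news', tokens, lambda p: ('tok-1', {'id1', 'id2'})))
print(poll_channel('news', tokens, lambda p: ('tok-1', set())))  # set()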