Example #1
    def run(self, ad_username, ad_password, username, admin='false'):
        """
        Imports or Updates a User Profile from AD to Mongo.
        :param ad_username: Active Directory Username
        :param ad_password: Password of Active Directory Username
        :param username: Username as in Active Directory whose profile needs to be imported to Superdesk.
        :param admin: 'true' to create the user as an administrator, 'false' (default) for a regular user
        :return: User Profile.
        """

        # map the 'admin' string flag to a user type
        user_type = 'administrator' if admin is not None and admin.lower() == 'true' else 'user'

        # Authenticate and fetch profile from AD
        settings = app.settings
        ad_auth = ADAuth(settings['LDAP_SERVER'], settings['LDAP_SERVER_PORT'], settings['LDAP_BASE_FILTER'],
                         settings['LDAP_USER_FILTER'], settings['LDAP_USER_ATTRIBUTES'], settings['LDAP_FQDN'])

        user_data = ad_auth.authenticate_and_fetch_profile(ad_username, ad_password, username)

        if len(user_data) == 0:
            raise SuperdeskApiError.notFoundError('Username not found')

        # Check if User Profile already exists in Mongo
        user = superdesk.get_resource_service('users').find_one(username=username, req=None)

        if user:
            superdesk.get_resource_service('users').patch(user.get('_id'), user_data)
        else:
            add_default_values(user_data, username, user_type=user_type)
            superdesk.get_resource_service('users').post([user_data])

        return user_data
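
A minimal invocation sketch for the command above; the class name and the argument values are assumptions for illustration, only the run() signature comes from the example:

    # hedged sketch: the command class name below is hypothetical
    cmd = ImportUserProfileFromADCommand()
    profile = cmd.run(
        ad_username='svc-ldap',   # AD account used to bind
        ad_password='secret',
        username='jdoe',          # profile to import into Superdesk
        admin='true',             # string flag, mapped to user_type above
    )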
Example #2
 def _get_field_values(self):
     values = {}
     vocabularies_resource = get_resource_service('vocabularies')
     values['anpa_category'] = vocabularies_resource.find_one(req=None, _id='categories')['items']
     req = ParsedRequest()
     req.where = json.dumps({'$or': [{"schema_field": "genre"}, {"_id": "genre"}]})
     genre = vocabularies_resource.get(req=req, lookup=None)
     if genre.count():
         values['genre'] = genre[0]['items']
     values['urgency'] = vocabularies_resource.find_one(req=None, _id='urgency')['items']
     values['priority'] = vocabularies_resource.find_one(req=None, _id='priority')['items']
     values['type'] = vocabularies_resource.find_one(req=None, _id='type')['items']
     subject = vocabularies_resource.find_one(req=None, schema_field='subject')
     if subject:
         values['subject'] = subject['items']
     else:
         values['subject'] = get_subjectcodeitems()
     values['desk'] = list(get_resource_service('desks').get(None, {}))
     values['stage'] = self._get_stage_field_values(values['desk'])
     values['sms'] = [{'qcode': 0, 'name': 'False'}, {'qcode': 1, 'name': 'True'}]
     values['embargo'] = [{'qcode': 0, 'name': 'False'}, {'qcode': 1, 'name': 'True'}]
     req = ParsedRequest()
     req.where = json.dumps({'$or': [{"schema_field": "place"}, {"_id": "place"}, {"_id": "locators"}]})
     place = vocabularies_resource.get(req=req, lookup=None)
     if place.count():
         values['place'] = place[0]['items']
     values['ingest_provider'] = list(get_resource_service('ingest_providers').get(None, {}))
     return values
Example #3
 def test_ingest_cancellation(self):
     provider_name = 'reuters'
     guid = 'tag_reuters.com_2016_newsml_L1N14N0FF:978556838'
     provider = get_resource_service('ingest_providers').find_one(name=provider_name, req=None)
     provider_service = self._get_provider_service(provider)
     provider_service.provider = provider
     provider_service.URL = provider.get('config', {}).get('url')
     items = provider_service.fetch_ingest(guid)
     for item in items:
         item['ingest_provider'] = provider['_id']
         item['expiry'] = utcnow() + timedelta(hours=11)
     self.ingest_items(items, provider, provider_service)
     guid = 'tag_reuters.com_2016_newsml_L1N14N0FF:1542761538'
     items = provider_service.fetch_ingest(guid)
     for item in items:
         item['ingest_provider'] = provider['_id']
         item['expiry'] = utcnow() + timedelta(hours=11)
     self.ingest_items(items, provider, provider_service)
     ingest_service = get_resource_service('ingest')
     lookup = {'uri': items[0].get('uri')}
     family_members = ingest_service.get_from_mongo(req=None, lookup=lookup)
     self.assertEqual(family_members.count(), 2)
     for relative in family_members:
         self.assertEqual(relative['pubstatus'], 'canceled')
         self.assertEqual(relative['state'], 'killed')
Example #4
    def _duplicate_versions(self, old_id, new_doc):
        """
        Duplicates the version history of the article identified by old_id. The identifiers of each version
        are changed to those of new_doc.

        :param old_id: identifier to fetch version history
        :param new_doc: identifiers from this doc will be used to create version history for the duplicated item.
        """
        resource_def = app.config['DOMAIN']['archive']
        version_id = versioned_id_field(resource_def)
        old_versions = get_resource_service('archive_versions').get(req=None, lookup={'guid': old_id})

        new_versions = []
        for old_version in old_versions:
            old_version[version_id] = new_doc[config.ID_FIELD]
            del old_version[config.ID_FIELD]

            old_version['guid'] = new_doc['guid']
            old_version['unique_name'] = new_doc['unique_name']
            old_version['unique_id'] = new_doc['unique_id']
            old_version['versioncreated'] = utcnow()
            if old_version[VERSION] == new_doc[VERSION]:
                old_version[ITEM_OPERATION] = new_doc[ITEM_OPERATION]
            new_versions.append(old_version)
        last_version = deepcopy(new_doc)
        last_version['_id_document'] = new_doc['_id']
        del last_version['_id']
        new_versions.append(last_version)
        if new_versions:
            get_resource_service('archive_versions').post(new_versions)
Example #5
    def test_products(self, article):
        req = ParsedRequest()
        results = []
        products = list(get_resource_service('products').get(req=req, lookup=None))
        for product in products:
            result = {}
            result['product_id'] = product['_id']
            result['matched'] = True

            reason = ''
            if not EnqueueService().conforms_product_targets(product, article):
                # Here it fails to match due to geo restriction
                # story has target_region and product has geo restriction
                result['matched'] = False

                if BasePublishService().is_targeted(article, 'target_regions'):
                    reason = 'Story has target_region'

                if product.get('geo_restrictions'):
                    reason = '{} {}'.format(reason, 'Product has target_region')

            if not EnqueueService().conforms_content_filter(product, article):
                # Here it fails to match due to content filter
                content_filter = product.get('content_filter')
                filter_doc = get_resource_service('content_filters').find_one(req=None,
                                                                              _id=content_filter['filter_id'])
                result['matched'] = False
                reason = 'Story does not match the filter: {}'.format(filter_doc.get('name'))

            result['reason'] = reason
            results.append(result)
        return results
Example #6
def get_provider_routing_scheme(provider):
    """Returns the ingest provider's routing scheme configuration.

    If provider has a routing scheme defined (i.e. scheme ID is not None), the
    scheme is fetched from the database. If not, nothing is returned.

    For all scheme rules that have a reference to a content filter defined,
    that filter's configuration is fetched from the database as well and
    embedded into the corresponding scheme rule.

    :param dict provider: ingest provider configuration

    :return: fetched provider's routing scheme configuration (if any)
    :rtype: dict or None
    """

    if not provider.get('routing_scheme'):
        return None

    schemes_service = superdesk.get_resource_service('routing_schemes')
    filters_service = superdesk.get_resource_service('content_filters')

    scheme = schemes_service.find_one(_id=provider['routing_scheme'], req=None)

    # for those routing rules that have a content filter defined,
    # get that filter from DB and embed it into the rule...
    rules_filters = (
        (rule, str(rule['filter']))
        for rule in scheme['rules'] if rule.get('filter'))

    for rule, filter_id in rules_filters:
        content_filter = filters_service.find_one(_id=filter_id, req=None)
        rule['filter'] = content_filter

    return scheme
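
A short usage sketch, assuming an application context and the resource names used above:

    # hedged sketch: resolve a provider's routing scheme with its content
    # filters embedded by the function above
    provider = superdesk.get_resource_service('ingest_providers').find_one(req=None, name='reuters')
    scheme = get_provider_routing_scheme(provider)
    if scheme:
        for rule in scheme['rules']:
            # rule['filter'] now holds the full content filter document
            print(rule.get('name'), rule.get('filter'))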
Example #7
    def test_expiring_with_content(self):
        provider_name = 'reuters'
        guid = 'tag_reuters.com_2014_newsml_KBN0FL0NM:10'
        provider = get_resource_service('ingest_providers').find_one(name=provider_name, req=None)
        provider_service = self._get_provider_service(provider)
        provider_service.provider = provider
        provider_service.URL = provider.get('config', {}).get('url')
        items = provider_service.fetch_ingest(guid)
        now = utcnow()
        for i, item in enumerate(items):
            item['ingest_provider'] = provider['_id']
            expiry_time = now - timedelta(hours=11)
            if i > 4:
                expiry_time = now + timedelta(minutes=11)

            item['expiry'] = item['versioncreated'] = expiry_time

        service = get_resource_service('ingest')
        service.post(items)

        # ingest the items and expire them
        before = service.get(req=None, lookup={})
        self.assertEqual(6, before.count())

        remove = RemoveExpiredContent()
        remove.run(provider.get('type'))

        # only one left in ingest
        after = service.get(req=None, lookup={})
        self.assertEqual(1, after.count())
Example #8
    def create(self, docs, **kwargs):
        doc = docs[0] if len(docs) > 0 else {}
        original_id = request.view_args['original_id']
        update_document = doc.get('update')

        archive_service = get_resource_service(ARCHIVE)
        original = archive_service.find_one(req=None, _id=original_id)
        self._validate_rewrite(original, update_document)

        digital = TakesPackageService().get_take_package(original)
        rewrite = self._create_rewrite_article(original, digital,
                                               existing_item=update_document,
                                               desk_id=doc.get('desk_id'))

        if update_document:
            # process the existing story
            archive_service.patch(update_document[config.ID_FIELD], rewrite)
            rewrite[config.ID_FIELD] = update_document[config.ID_FIELD]
            ids = [update_document[config.ID_FIELD]]
        else:
            ids = archive_service.post([rewrite])
            build_custom_hateoas(CUSTOM_HATEOAS, rewrite)

        self._add_rewritten_flag(original, digital, rewrite)
        get_resource_service('archive_broadcast').on_broadcast_master_updated(ITEM_CREATE,
                                                                              item=original,
                                                                              rewrite_id=ids[0])
        return [rewrite]
Example #9
 def run(self, republish):
     # update themes
     theme_service = get_resource_service('themes')
     created, updated = theme_service.update_registered_theme_with_local_files()
     print('\n* %d themes updated from local files\n' % (len(created) + len(updated)))
     # retrieves all opened blogs
     blogs_service = get_resource_service('blogs')
     blogs = blogs_service.get(req=None, lookup=dict(blog_status='open'))
     print('* Update the theme for every blog\n')
     for blog in blogs:
         theme = blogs_service.get_theme_snapshot(blog['blog_preferences']['theme'])
         try:
             blogs_service.system_update(ObjectId(blog['_id']), {'theme': theme}, blog)
         except eve.io.base.DataLayer.OriginalChangedError:
             print(u'! an error occurred during saving blog "%s".' % (blog['title']),
                   'Can be a broken relationship (with user for instance)')
         else:
             print('- Blog "%s"\'s theme was updated to %s %s' % (
                 blog['title'], theme['name'], theme['version']))
     # republish on s3
     if republish:
         print('\n* Republishing blogs:\n')
         for blog in blogs:
             url = publish_blog_embed_on_s3(blog_id=str(blog['_id']), safe=False)
             print('  - Blog "%s" republished: %s' % (blog['title'], url))
Example #10
 def upload_fixture_image(
     self, fixture_image_path,
     verification_stats_path, verification_result_path, headline='test'
 ):
     with self.app.app_context():
         with open(fixture_image_path, mode='rb') as f:
             file_name = ntpath.basename(fixture_image_path)
             file_type = 'image'
             content_type = '%s/%s' % (file_type, imghdr.what(f))
             file_id = app.media.put(
                 f, filename=file_name,
                 content_type=content_type,
                 resource=get_resource_service('ingest').datasource,
                 metadata={}
             )
             inserted = [file_id]
             renditions = generate_renditions(
                 f, file_id, inserted, file_type, content_type,
                 rendition_config=config.RENDITIONS['picture'],
                 url_for_media=url_for_media
             )
         data = [{
             'headline': headline,
             'slugline': 'rebuild',
             'renditions': renditions,
             'type': 'picture'
         }]
         image_id = get_resource_service('ingest').post(data)
     with open(verification_result_path, 'r') as f:
         self.expected_verification_results.append(json.load(f))
     with open(verification_stats_path, 'r') as f:
         self.expected_verification_stats.append(json.load(f))
     return image_id
Example #11
 def enhance_document_with_user_privileges(self, session_doc, user_doc):
     role_doc = get_resource_service('users').get_role(user_doc)
     get_resource_service('users').set_privileges(user_doc, role_doc)
     session_doc[_privileges_key] = user_doc.get(_privileges_key, {})
     # set last_updated to max for session/user/role so that client will fetch changes
     # after a change to any of those
     session_doc[app.config['LAST_UPDATED']] = last_updated(session_doc, user_doc, role_doc)
Example #12
 def find_one(self, req, **lookup):
     session = get_resource_service('sessions').find_one(req=None, _id=lookup['_id'])
     _id = session['user'] if session else lookup['_id']
     doc = get_resource_service('users').find_one(req, _id=_id)
     if doc:
         doc['_id'] = session['_id'] if session else _id
     return doc
Example #13
def transmit_items(queue_items, subscriber, destination, output_channels):
    failed_items = []

    for queue_item in queue_items:
        # Check if output channel is active
        if not (output_channels.get(str(queue_item['output_channel_id']), {})).get('is_active', False):
            continue

        try:
            if not is_on_time(queue_item, destination):
                continue

            # update the status of the item to in-progress
            queue_update = {'state': 'in-progress', 'transmit_started_at': utcnow()}
            superdesk.get_resource_service('publish_queue').patch(queue_item.get('_id'), queue_update)

            # get the formatted item
            formatted_item = superdesk.get_resource_service('formatted_item').\
                find_one(req=None, _id=queue_item['formatted_item_id'])

            transmitter = superdesk.publish.transmitters[destination.get('delivery_type')]
            transmitter.transmit(queue_item, formatted_item, subscriber, destination)
            update_content_state(queue_item)
        except Exception:
            failed_items.append(queue_item)

    if len(failed_items) > 0:
        logger.error('Failed to publish the following items: %s', str(failed_items))
Example #14
    def unlock(self, item_filter, user_id, session_id, etag):
        item_model = get_model(ItemModel)
        item = item_model.find_one(item_filter)

        if not item:
            raise SuperdeskApiError.notFoundError()

        if not item.get(LOCK_USER):
            raise SuperdeskApiError.badRequestError(message="Item is not locked.")

        can_user_unlock, error_message = self.can_unlock(item, user_id)

        if can_user_unlock:
            self.app.on_item_unlock(item, user_id)

            # delete the item if nothing is saved so far
            # version 0 created on lock item
            if item.get(config.VERSION, 0) == 0 and item[ITEM_STATE] == CONTENT_STATE.DRAFT:
                superdesk.get_resource_service('archive').delete_action(lookup={'_id': item['_id']})
                push_content_notification([item])
            else:
                updates = {LOCK_USER: None, LOCK_SESSION: None, 'lock_time': None, 'force_unlock': True}
                item_model.update(item_filter, updates)
                self.app.on_item_unlocked(item, user_id)

            push_notification('item:unlock',
                              item=str(item_filter.get(config.ID_FIELD)),
                              item_version=str(item.get(config.VERSION)),
                              state=item.get(ITEM_STATE),
                              user=str(user_id), lock_session=str(session_id))
        else:
            raise SuperdeskApiError.forbiddenError(message=error_message)

        item = item_model.find_one(item_filter)
        return item
Example #15
    def update(self, id, updates, original):
        original_state = original[config.CONTENT_STATE]
        if not is_workflow_state_transition_valid("spike", original_state):
            raise InvalidStateTransitionError()

        package_service = PackageService()
        user = get_user(required=True)

        item = get_resource_service(ARCHIVE).find_one(req=None, _id=id)
        expiry_minutes = app.settings["SPIKE_EXPIRY_MINUTES"]

        # check if the item is on a desk; if so, use the desk's spike_expiry
        if is_assigned_to_a_desk(item):
            desk = get_resource_service("desks").find_one(_id=item["task"]["desk"], req=None)
            expiry_minutes = desk.get("spike_expiry", expiry_minutes)

        updates[EXPIRY] = get_expiry_date(expiry_minutes)
        updates[REVERT_STATE] = item.get(app.config["CONTENT_STATE"], None)

        if original.get("rewrite_of"):
            updates["rewrite_of"] = None

        item = self.backend.update(self.datasource, id, updates, original)
        push_notification("item:spike", item=str(item.get("_id")), user=str(user))
        package_service.remove_spiked_refs_from_package(id)
        return item
Example #16
    def restore_version(self, id, doc):
        item_id = id
        old_version = int(doc.get('old_version', 0))
        last_version = int(doc.get('last_version', 0))
        if not all([item_id, old_version, last_version]):
            return None

        old = get_resource_service('archive_versions').find_one(req=None, _id_document=item_id, _version=old_version)
        if old is None:
            raise SuperdeskApiError.notFoundError('Invalid version %s' % old_version)

        curr = get_resource_service(SOURCE).find_one(req=None, _id=item_id)
        if curr is None:
            raise SuperdeskApiError.notFoundError('Invalid item id %s' % item_id)

        if curr[config.VERSION] != last_version:
            raise SuperdeskApiError.preconditionFailedError('Invalid last version %s' % last_version)
        old['_id'] = old['_id_document']
        old['_updated'] = old['versioncreated'] = utcnow()
        set_item_expiry(old, doc)
        del old['_id_document']

        resolve_document_version(old, 'archive', 'PATCH', curr)

        remove_unwanted(old)
        res = super().replace(id=item_id, document=old)

        del doc['old_version']
        del doc['last_version']
        doc.update(old)
        return res
Example #17
    def lock(self, item_filter, user_id, session_id, etag):
        item_model = get_model(ItemModel)
        item = item_model.find_one(item_filter)

        if not item:
            raise SuperdeskApiError.notFoundError()

        can_user_lock, error_message = self.can_lock(item, user_id, session_id)

        if can_user_lock:
            self.app.on_item_lock(item, user_id)
            updates = {LOCK_USER: user_id, LOCK_SESSION: session_id, 'lock_time': utcnow()}
            item_model.update(item_filter, updates)

            if item.get(TASK):
                item[TASK]['user'] = user_id
            else:
                item[TASK] = {'user': user_id}

            superdesk.get_resource_service('tasks').assign_user(item[config.ID_FIELD], item[TASK])
            self.app.on_item_locked(item, user_id)
            push_notification('item:lock',
                              item=str(item.get(config.ID_FIELD)),
                              item_version=str(item.get(config.VERSION)),
                              user=str(user_id), lock_time=updates['lock_time'],
                              lock_session=str(session_id))
        else:
            raise SuperdeskApiError.forbiddenError(message=error_message)

        item = item_model.find_one(item_filter)
        return item
Example #18
    def delete(self, lookup):
        """
        Overriding to delete stages before deleting a desk
        """

        superdesk.get_resource_service('stages').delete(lookup={'desk': lookup.get(config.ID_FIELD)})
        super().delete(lookup)
Example #19
 def deschedule_item(self, updates, doc):
     updates['state'] = 'in_progress'
     updates['publish_schedule'] = None
     # delete entries from publish queue
     get_resource_service('publish_queue').delete_by_article_id(doc['_id'])
     # delete entry from published repo
     get_resource_service('published').delete_by_article_id(doc['_id'])
Example #20
    def test_files_dont_duplicate_ingest(self):
        provider_name = 'reuters'
        guid = 'tag_reuters.com_2014_newsml_KBN0FL0NM'
        provider = get_resource_service('ingest_providers').find_one(name=provider_name, req=None)
        provider_service = self.provider_services[provider.get('type')]
        provider_service.provider = provider

        items = provider_service.fetch_ingest(guid)
        for item in items:
            item['ingest_provider'] = provider['_id']
            item['expiry'] = utcnow() + timedelta(hours=11)

        service = get_resource_service('ingest')
        service.post(items)

        # ingest the items
        self.ingest_items(items, provider)

        items = provider_service.fetch_ingest(guid)
        for item in items:
            item['ingest_provider'] = provider['_id']
            item['expiry'] = utcnow() + timedelta(hours=11)

        # ingest them again
        self.ingest_items(items, provider)

        # 12 files in grid fs
        current_files = self.app.media.fs('upload').find()
        self.assertEqual(12, current_files.count())
Example #21
    def test_subject_to_anpa_category_derived_ingest_ignores_inactive_map_entries(self):
        vocab = [{'_id': 'iptc_category_map',
                  'items': [{'name': 'Finance', 'category': 'f', 'subject': '04000000', 'is_active': False}]},
                 {'_id': 'categories',
                  'items': [{'is_active': True, 'name': 'Australian Weather', 'qcode': 'b', 'subject': '17000000'}]}]

        self.app.data.insert('vocabularies', vocab)

        provider_name = 'AAP'
        guid = 'nitf-fishing.xml'
        provider = get_resource_service('ingest_providers').find_one(name=provider_name, req=None)
        provider_service = self.provider_services[provider.get('type')]
        provider_service.provider = provider

        items = provider_service.parse_file(guid, provider)
        for item in items:
            item['ingest_provider'] = provider['_id']
            item['expiry'] = utcnow() + timedelta(hours=11)

        service = get_resource_service('ingest')
        service.post(items)

        # ingest the items and check that no anpa category has been derived (the map entry is inactive)
        self.ingest_items(items, provider)
        self.assertNotIn('anpa_category', items[0])
Example #22
def get_expiry(desk_id, stage_id, offset=None):
    """
    Calculates the expiry for content by fetching the expiry duration from one of the below:
        1. desk identified by desk_id
        2. stage identified by stage_id
    :param desk_id: desk identifier
    :param stage_id: stage identifier
    :return: when the doc will expire
    """
    stage = None
    desk = None

    if desk_id:
        desk = superdesk.get_resource_service('desks').find_one(req=None, _id=desk_id)

        if not desk:
            raise SuperdeskApiError.notFoundError('Invalid desk identifier %s' % desk_id)

    if stage_id:
        stage = get_resource_service('stages').find_one(req=None, _id=stage_id)

        if not stage:
            raise SuperdeskApiError.notFoundError('Invalid stage identifier %s' % stage_id)

    return get_item_expiry(desk, stage, offset)
Example #23
def get_expiry(desk_id=None, stage_id=None, desk_or_stage_doc=None):
    """
    Calculates the expiry for content by fetching the expiry duration from one of the below:
        1. desk identified by desk_id
        2. stage identified by stage_id. This will ignore desk_id if specified
        3. desk doc or stage doc identified by desk_or_stage_doc. This will ignore desk_id and stage_id if specified

    :param desk_id: desk identifier
    :param stage_id: stage identifier
    :param desk_or_stage_doc: doc from either desks collection or stages collection
    :return: when the doc will expire
    """

    stage = None

    if desk_or_stage_doc is None and desk_id:
        desk = superdesk.get_resource_service('desks').find_one(req=None, _id=desk_id)

        if not desk:
            raise SuperdeskApiError.notFoundError('Invalid desk identifier %s' % desk_id)

        if not stage_id:
            stage = get_resource_service('stages').find_one(req=None, _id=desk['incoming_stage'])

            if not stage:
                raise SuperdeskApiError.notFoundError('Invalid stage identifier %s' % desk['incoming_stage'])

    if desk_or_stage_doc is None and stage_id:
        stage = get_resource_service('stages').find_one(req=None, _id=stage_id)

        if not stage:
            raise SuperdeskApiError.notFoundError('Invalid stage identifier %s' % stage_id)

    return get_item_expiry(app=app, stage=desk_or_stage_doc or stage)
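
The two get_expiry variants above are easy to confuse, so here is a sketch of their precedence rules (argument values are hypothetical):

    # Example #22 variant: desk and stage are looked up independently,
    # with an optional offset passed through
    expiry = get_expiry('desk_object_id', 'stage_object_id', offset=None)

    # Example #23 variant: a pre-fetched desk/stage doc wins over both ids,
    # and stage_id wins over desk_id
    expiry = get_expiry(desk_id='desk_object_id')           # falls back to the desk's incoming stage
    expiry = get_expiry(stage_id='stage_object_id')         # desk_id would be ignored
    expiry = get_expiry(desk_or_stage_doc=stage_document)   # both ids would be ignored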
Example #24
    def _validate_disable(self, updates, original):
        """
        Checks the templates and desks that are referencing the given
        content profile if the profile is being disabled
        """
        if 'enabled' in updates and updates.get('enabled') is False and original.get('enabled') is True:
            templates = list(superdesk.get_resource_service('content_templates').
                             get_templates_by_profile_id(original.get('_id')))

            if len(templates) > 0:
                template_names = ', '.join([t.get('template_name') for t in templates])
                raise SuperdeskApiError.badRequestError(
                    message='Cannot disable content profile as the following templates are referencing it: {}'.
                    format(template_names))

            req = ParsedRequest()
            all_desks = list(superdesk.get_resource_service('desks').get(req=req, lookup={}))
            profile_desks = [desk for desk in all_desks if
                             desk.get('default_content_profile') == str(original.get('_id'))]

            if len(profile_desks) > 0:
                profile_desk_names = ', '.join([d.get('name') for d in profile_desks])
                raise SuperdeskApiError.badRequestError(
                    message='Cannot disable content profile as the following desks are referencing it: {}'.
                    format(profile_desk_names))
Example #25
    def queue_transmission(self, doc, subscribers):
        """
        Formats and then queues the article for transmission to the passed subscribers.
        ::Important Note:: Format types across subscribers can repeat, but the formatted item can't be generated
        once per format type, because each formatted item must carry a publish sequence number generated per
        subscriber.
        :param dict doc: document to queue for transmission
        :param list subscribers: List of subscriber dict.
        :return : (list, bool) tuple of list of missing formatters and boolean flag. True if queued else False
        """

        try:
            queued = False
            no_formatters = []
            for subscriber in subscribers:
                try:
                    if doc[ITEM_TYPE] not in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED] and \
                            subscriber.get('subscriber_type', '') == SUBSCRIBER_TYPES.WIRE:
                        # wire subscribers can get only text and preformatted stories
                        continue

                    for destination in subscriber['destinations']:
                        # Step 2(a)
                        formatter = get_formatter(destination['format'], doc)

                        if not formatter:  # if formatter not found then record it
                            no_formatters.append(destination['format'])
                            continue

                        formatted_docs = formatter.format(doc, subscriber)

                        for pub_seq_num, formatted_doc in formatted_docs:
                            publish_queue_item = dict()
                            publish_queue_item['item_id'] = doc['_id']
                            publish_queue_item['item_version'] = doc[config.VERSION]
                            publish_queue_item['formatted_item'] = formatted_doc
                            publish_queue_item['subscriber_id'] = subscriber['_id']
                            publish_queue_item['destination'] = destination
                            publish_queue_item['published_seq_num'] = pub_seq_num
                            publish_queue_item['publish_schedule'] = doc.get('publish_schedule', None)
                            publish_queue_item['unique_name'] = doc.get('unique_name', None)
                            publish_queue_item['content_type'] = doc.get('type', None)
                            publish_queue_item['headline'] = doc.get('headline', None)

                            self.set_state(doc, publish_queue_item)
                            if publish_queue_item.get(ITEM_STATE):
                                publish_queue_item['publishing_action'] = publish_queue_item.get(ITEM_STATE)
                                del publish_queue_item[ITEM_STATE]
                            else:
                                publish_queue_item['publishing_action'] = self.published_state

                            get_resource_service('publish_queue').post([publish_queue_item])
                            queued = True
                except Exception:
                    logger.exception("Failed to queue item for id {} with headline {} for subscriber {}."
                                     .format(doc.get(config.ID_FIELD), doc.get('headline'), subscriber.get('name')))

            return no_formatters, queued
        except Exception:
            raise
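
A hedged sketch of calling this method; service, doc and subscribers stand in for a publish service instance and its inputs:

    # queue a document for its subscribers and surface destinations
    # that had no matching formatter
    no_formatters, queued = service.queue_transmission(doc, subscribers)
    if no_formatters:
        logger.warning('No formatter found for formats: %s', no_formatters)
    if not queued:
        logger.warning('Nothing was queued for item %s', doc.get('_id'))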
Example #26
    def test_subject_to_anpa_category_derived_ingest_ignores_inactive_map_entries(self):
        vocab = [{'_id': 'iptc_category_map',
                  'items': [{'name': 'Finance', 'category': 'f', 'subject': '04000000', 'is_active': False}]},
                 {'_id': 'categories',
                  'items': [{'is_active': True, 'name': 'Australian Weather', 'qcode': 'b', 'subject': '17000000'}]}]

        self.app.data.insert('vocabularies', vocab)

        provider_name = 'AAP'
        provider = get_resource_service('ingest_providers').find_one(name=provider_name, req=None)
        file_path = os.path.join(provider.get('config', {}).get('path', ''), 'nitf-fishing.xml')
        provider_service = self._get_provider_service(provider)
        feeding_parser = provider_service.get_feed_parser(provider)
        with open(file_path, 'r') as f:
            xml_string = etree.etree.fromstring(f.read())
            items = [feeding_parser.parse(xml_string, provider)]
            for item in items:
                item['ingest_provider'] = provider['_id']
                item['expiry'] = utcnow() + timedelta(hours=11)

            service = get_resource_service('ingest')
            service.post(items)

            # ingest the items and check that no anpa category has been derived (the map entry is inactive)
            self.ingest_items(items, provider)
            self.assertNotIn('anpa_category', items[0])
Example #27
    def on_delete(self, doc):
        """
        Checks that deleting the stage would not violate data integrity, and raises an exception if it would.

            1/ Can't delete the default incoming stage
            2/ The stage must have no documents (spiked or unspiked)
            3/ The stage can not be referred to by an ingest routing rule

        :param doc:
        :return:
        """
        if doc['default_incoming'] is True:
            desk_id = doc.get('desk', None)
            if desk_id and superdesk.get_resource_service('desks').find_one(req=None, _id=desk_id):
                raise SuperdeskApiError.preconditionFailedError(message='Cannot delete a default stage.')

        archive_versions_query = {'task.stage': str(doc[config.ID_FIELD])}
        items = superdesk.get_resource_service('archive_versions').get(req=None, lookup=archive_versions_query)
        if items and items.count():
            raise SuperdeskApiError.preconditionFailedError(
                message='Cannot delete stage as it has article(s) or is referenced by versions of article(s).')

        # check if the stage is referred to in an ingest routing rule
        rules = self._stage_in_rule(doc[config.ID_FIELD])
        if rules.count() > 0:
            rule_names = ', '.join(rule.get('name') for rule in rules)
            raise SuperdeskApiError.preconditionFailedError(
                message='Stage is referred to by Ingest Routing Schemes: {}'.format(rule_names))
Example #28
    def data_scaffolding_test(self):
        with self.app.app_context():
            command = AppInitializeWithDataCommand()
            result = command.run()
            self.assertEqual(result, 0)

            service = get_resource_service('text_archive')
            docs = [{
                'type': 'text',
                'abstract': 'test abstract {}'.format(x),
                'headline': 'test headline {}'.format(x),
                'body_html': 'test long story body {}'.format(x)
            } for x in range(0, 40)]
            service.post(docs)

            stories_per_desk = 2
            existing_desks = 1
            command = AppScaffoldDataCommand()
            result = command.run(stories_per_desk)
            self.assertEqual(result, 0)

            cursor = get_resource_service('desks').get_from_mongo(None, {})
            self.assertEqual(cursor.count(), existing_desks)

            cursor = get_resource_service('archive').get_from_mongo(None, {})
            self.assertEqual(cursor.count(), existing_desks * stories_per_desk)
Example #29
    def on_deleted(self, doc):
        """
        Overriding to clean up reset password tokens.
        """

        super().on_deleted(doc)
        get_resource_service('reset_user_password').remove_all_tokens_for_email(doc.get('email'))
Example #30
    def test_expiring_content_with_files(self):
        provider_name = 'reuters'
        guid = 'tag_reuters.com_2014_newsml_KBN0FL0NM'
        provider = get_resource_service('ingest_providers').find_one(name=provider_name, req=None)
        provider_service = self.provider_services[provider.get('type')]
        provider_service.provider = provider

        items = provider_service.fetch_ingest(guid)
        for item in items:
            item['ingest_provider'] = provider['_id']

        now = utcnow()
        items[0]['expiry'] = now - timedelta(hours=11)
        items[1]['expiry'] = now - timedelta(hours=11)
        items[2]['expiry'] = now + timedelta(hours=11)
        items[5]['versioncreated'] = now + timedelta(minutes=11)

        service = get_resource_service('ingest')
        service.post(items)

        # ingest the items and expire them
        self.ingest_items(items, provider)

        # four files in grid fs
        current_files = self.app.media.fs('upload').find()
        self.assertEqual(4, current_files.count())

        remove = RemoveExpiredContent()
        remove.run(provider.get('type'))

        # all gone
        current_files = self.app.media.fs('upload').find()
        self.assertEqual(0, current_files.count())
Example #31
    def on_delete(self, doc):
        """Runs on delete of archive item.

        Overriding to validate the item being killed is actually eligible for kill. Validates the following:
            1. Is item of type Text?
            2. Is item a Broadcast Script?
            3. Does the item act as a Master Story for any of the existing broadcasts?
            4. Is the item available in production or part of a normal package?
            5. Is the associated Digital Story available in production or part of a normal package?
            6. If the item is a Take, is any Take available in production or part of a normal package?

        :param doc: represents the article in archived collection
        :type doc: dict
        :raises SuperdeskApiError.badRequestError() if any of the above validation conditions fail.
        """

        bad_req_error = SuperdeskApiError.badRequestError

        id_field = doc[config.ID_FIELD]
        item_id = doc['item_id']

        doc['item_id'] = id_field
        doc[config.ID_FIELD] = item_id

        if doc[ITEM_TYPE] != CONTENT_TYPE.TEXT:
            raise bad_req_error(
                message=
                'Only Text articles are allowed to Kill in Archived repo')

        if is_genre(doc, BROADCAST_GENRE):
            raise bad_req_error(
                message=
                "Killing of Broadcast Items isn't allowed in Archived repo")

        if get_resource_service(
                'archive_broadcast').get_broadcast_items_from_master_story(
                    doc, True):
            raise bad_req_error(
                message=
                "Can't kill as this article acts as a Master Story for existing broadcast(s)"
            )

        if get_resource_service(ARCHIVE).find_one(req=None,
                                                  _id=doc[GUID_FIELD]):
            raise bad_req_error(
                message="Can't Kill as article is still available in production"
            )

        if is_item_in_package(doc):
            raise bad_req_error(
                message="Can't kill as article is part of a Package")

        takes_package_service = TakesPackageService()
        takes_package_id = takes_package_service.get_take_package_id(doc)
        if takes_package_id:
            if get_resource_service(ARCHIVE).find_one(req=None,
                                                      _id=takes_package_id):
                raise bad_req_error(
                    message=
                    "Can't Kill as the Digital Story is still available in production"
                )

            req = ParsedRequest()
            req.sort = '[("%s", -1)]' % config.VERSION
            takes_package = list(
                self.get(req=req, lookup={'item_id': takes_package_id}))
            if not takes_package:
                raise bad_req_error(
                    message=
                    'Digital Story of the article not found in Archived repo')

            takes_package = takes_package[0]
            if is_item_in_package(takes_package):
                raise bad_req_error(
                    message="Can't kill as Digital Story is part of a Package")

            for takes_ref in takes_package_service.get_package_refs(
                    takes_package):
                if takes_ref[RESIDREF] != doc[GUID_FIELD]:
                    if get_resource_service(ARCHIVE).find_one(
                            req=None, _id=takes_ref[RESIDREF]):
                        raise bad_req_error(
                            message=
                            "Can't Kill as Take(s) are still available in production"
                        )

                    take = list(
                        self.get(req=None,
                                 lookup={'item_id': takes_ref[RESIDREF]}))
                    if not take:
                        raise bad_req_error(
                            message='One of Take(s) not found in Archived repo'
                        )

                    if is_item_in_package(take[0]):
                        raise bad_req_error(
                            message=
                            "Can't kill as one of Take(s) is part of a Package"
                        )

        doc['item_id'] = item_id
        doc[config.ID_FIELD] = id_field
Example #32
 def number_of_data_updates_applied(self):
     return get_resource_service('data_updates').find({}).count()
Example #33
def get_expired_items(provider_id, ingest_collection):
    query_filter = get_query_for_expired_items(provider_id)
    return superdesk.get_resource_service(ingest_collection).get_from_mongo(
        lookup=query_filter, req=None)
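
For illustration, iterating the expired items of one provider might look like this (a sketch; provider is a fetched ingest provider document):

    # hedged usage sketch: walk expired ingest items for one provider
    for item in get_expired_items(provider['_id'], 'ingest'):
        print(item['_id'], item.get('expiry'))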
Example #34
def delete(id):
    """ Deletes the user by given id """
    get_resource_service('users').delete({'_id': ObjectId(id)})
    return jsonify({'success': True}), 200
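
Assuming the view is registered for DELETE requests under a route such as /users/<id> (an assumption), exercising it with Flask's test client might look like:

    # hedged sketch: the route is hypothetical
    resp = app.test_client().delete('/users/%s' % user_id)
    assert resp.status_code == 200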
Example #35
def step_impl_given_role(context, role_name):
    with context.app.test_request_context(context.app.config['URL_PREFIX']):
        role = get_resource_service('roles').find_one(name=role_name, req=None)
        data = json.dumps({'roles': [str(role['_id'])]})
    response = patch_current_user(context, data)
    assert_ok(response)
Example #36
def step_impl_given_empty(context, resource):
    with context.app.test_request_context(context.app.config['URL_PREFIX']):
        get_resource_service(resource).delete_action()
Example #37
 def on_created(self, docs):
     for doc in docs:
         get_resource_service('preferences').set_session_based_prefs(
             doc['_id'], doc['user'])
Example #38
 def remove_old_default(self, desk, field):
     lookup = {'$and': [{field: True}, {'desk': str(desk)}]}
     stages = self.get(req=None, lookup=lookup)
     for stage in stages:
         get_resource_service('stages').update(stage.get('_id'),
                                               {field: False}, stage)
Example #39
 def get_stage_documents(self, stage_id):
     query_filter = superdesk.json.dumps({'term': {'task.stage': stage_id}})
     req = ParsedRequest()
     req.args = {'filter': query_filter}
     return superdesk.get_resource_service(ARCHIVE).get(req, None)
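
A short sketch of consuming the returned cursor; service stands in for an instance of the class above:

    # count the archive documents sitting on a stage
    docs = service.get_stage_documents(str(stage['_id']))
    print('%d item(s) on stage' % docs.count())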
Example #40
    def update(self, id, updates, original):
        """Runs on update of archive item.

        Overriding to handle the Kill workflow in the Archived repo:
            1. Check if the article has an associated Digital Story and if that Digital Story has more Takes.
               If both exist, all of them are killed along with the one requested.
            2. If the item is flagged as archived only, it was never created by or published from the system, so
               all that needs to be done is to delete it and send an email to all subscribers.
            3. For each article being killed do the following:
                i.   Create an entry in archive, archive_versions and published collections.
                ii.  Query the Publish Queue in Legal Archive and find the subscribers who received the article
                     previously and create transmission entries in Publish Queue.
                iii. Change the state of the article to Killed in Legal Archive.
                iv.  Delete all the published versions from Archived.
                v.   Send a broadcast email to all subscribers.
        :param id: primary key of the item to be killed
        :type id: str
        :param updates: updates to be applied on the article before saving
        :type updates: dict
        :param original:
        :type original: dict
        """

        # Step 1
        articles_to_kill = self._find_articles_to_kill({'_id': id})
        logger.info('Fetched articles to kill for id: {}'.format(id))
        articles_to_kill.sort(
            key=itemgetter(ITEM_TYPE),
            reverse=True)  # Needed because package has to be inserted last
        kill_service = KillPublishService()

        updated = original.copy()

        for article in articles_to_kill:
            updates_copy = deepcopy(updates)
            kill_service.apply_kill_override(article, updates_copy)
            updated.update(updates_copy)
            # Step 2: if it is flagged as archived only, it has no related items in the system and can be deleted.
            # An email is sent to all subscribers.
            if original.get('flags', {}).get('marked_archived_only', False):
                super().delete({'item_id': article['item_id']})
                logger.info('Delete for article: {}'.format(
                    article[config.ID_FIELD]))
                kill_service.broadcast_kill_email(article, updates_copy)
                logger.info('Broadcast kill email for article: {}'.format(
                    article[config.ID_FIELD]))
                continue

            # Step 3(i)
            self._remove_and_set_kill_properties(article, articles_to_kill,
                                                 updated)
            logger.info(
                'Removing and setting properties for article: {}'.format(
                    article[config.ID_FIELD]))

            # Step 3(ii)
            transmission_details = list(
                get_resource_service(LEGAL_PUBLISH_QUEUE_NAME).get(
                    req=None, lookup={'item_id': article['item_id']}))

            if transmission_details:
                subscriber_ids = [
                    t['_subscriber_id'] for t in transmission_details
                ]
                query = {'$and': [{config.ID_FIELD: {'$in': subscriber_ids}}]}
                subscribers = list(
                    get_resource_service('subscribers').get(req=None,
                                                            lookup=query))

                EnqueueKilledService().queue_transmission(article, subscribers)
                logger.info('Queued Transmission for article: {}'.format(
                    article[config.ID_FIELD]))

            article[config.ID_FIELD] = article.pop('item_id')

            # Step 3(iv)
            super().delete({'item_id': article[config.ID_FIELD]})
            logger.info('Delete for article: {}'.format(
                article[config.ID_FIELD]))

            # Step 3(i) - Creating entries in published collection
            docs = [article]
            get_resource_service(ARCHIVE).post(docs)
            insert_into_versions(doc=article)
            published_doc = deepcopy(article)
            published_doc[QUEUE_STATE] = PUBLISH_STATE.QUEUED
            get_resource_service('published').post([published_doc])
            logger.info(
                'Insert into archive and published for article: {}'.format(
                    article[config.ID_FIELD]))

            # Step 3(iii)
            import_into_legal_archive.apply_async(
                countdown=3, kwargs={'item_id': article[config.ID_FIELD]})
            logger.info('Legal Archive import for article: {}'.format(
                article[config.ID_FIELD]))

            # Step 3(v)
            kill_service.broadcast_kill_email(article, updates)
            logger.info('Broadcast kill email for article: {}'.format(
                article[config.ID_FIELD]))
Example #41
 def get(self, req, lookup):
     """
     Return the list of languages defined in the config file.
     """
     languages = superdesk.get_resource_service("vocabularies").get_languages()
     return ListCursor([view_language(lang) for lang in languages])
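
A sketch of consuming the resource; service stands in for an instance of the class above, and the item shape is assumed:

    # languages come back wrapped in a ListCursor
    cursor = service.get(req=None, lookup={})
    languages = list(cursor)  # e.g. [{'language': 'en', ...}] (shape assumed)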
Example #42
 def clear_desk_ref(self, doc, field):
     desk = get_resource_service('desks').find_one(_id=doc.get('desk'),
                                                   req=None)
     if desk:
         get_resource_service('desks').update(doc.get('desk'),
                                              {field: None}, desk)
Example #43
 def on_created(self, docs):
     for doc in docs:
         push_notification(self.notification_key,
                           created=1,
                           desk_id=str(doc.get(config.ID_FIELD)))
         get_resource_service("users").update_stage_visibility_for_users()
Example #44
    def _create_rewrite_article(self,
                                original,
                                existing_item=None,
                                desk_id=None):
        """Creates a new story and sets the metadata from original.

        :param dict original: original story
        :param dict existing_item: existing story that is being re-written
        :param str desk_id: identifier of the desk the new story should be sent to
        :return: new story
        """
        rewrite = dict()

        fields = [
            'family_id', 'event_id', 'flags', 'language', ASSOCIATIONS, 'extra'
        ]
        existing_item_preserve_fields = (ASSOCIATIONS, 'flags')

        if app.config.get('COPY_ON_REWRITE_FIELDS'):
            fields.extend(app.config['COPY_ON_REWRITE_FIELDS'])

        if existing_item:
            # when associating an existing item as the update, merge the subjects
            subjects = original.get('subject', [])
            unique_subjects = {subject.get('qcode') for subject in subjects}
            rewrite['subject'] = [
                subject for subject in existing_item.get('subject', [])
                if subject.get('qcode') not in unique_subjects
            ]
            rewrite['subject'].extend(subjects)
            rewrite['flags'] = original['flags'] or {}

            # preserve flags
            for key in rewrite.get('flags').keys():
                rewrite['flags'][key] = (original['flags'][key]
                                         or existing_item.get('flags', {}).get(key, False))

            original_associations = original.get(ASSOCIATIONS) or {}
            existing_associations = existing_item.get(ASSOCIATIONS) or {}
            rewrite[ASSOCIATIONS] = existing_associations

            # if the existing item has association then preserve the association
            for key, assoc in original_associations.items():
                if not existing_associations.get(key):
                    rewrite[ASSOCIATIONS][key] = assoc
        else:
            # ingest provider and source to be retained for new item
            fields.extend(['ingest_provider', 'source'])

            if original.get('profile'):
                content_type = get_resource_service('content_types').find_one(
                    req=None, _id=original['profile'])
                extended_fields = list(content_type['schema'].keys())
                # extra fields needed.
                extended_fields.extend([
                    'profile', 'keywords', 'target_regions', 'target_types',
                    'target_subscribers'
                ])
            else:
                extended_fields = [
                    'abstract', 'anpa_category', 'pubstatus', 'slugline',
                    'urgency', 'subject', 'priority', 'byline', 'dateline',
                    'headline', 'place', 'genre', 'body_footer',
                    'company_codes', 'keywords', 'target_regions',
                    'target_types', 'target_subscribers'
                ]

            fields.extend(extended_fields)

        for field in fields:
            if original.get(field):
                # don't overwrite some fields in existing items
                if existing_item and field in existing_item_preserve_fields:
                    continue

                rewrite[field] = original[field]

        # if the original was flagged for SMS the rewrite should not be.
        if not existing_item and rewrite.get('flags', {}).get(
                'marked_for_sms', False):
            rewrite['flags']['marked_for_sms'] = False

        # SD-4595 - Default value for the update article to be set based on the system config.
        if config.RESET_PRIORITY_VALUE_FOR_UPDATE_ARTICLES:
            # if True then reset to the default priority value.
            rewrite['priority'] = int(
                config.DEFAULT_PRIORITY_VALUE_FOR_MANUAL_ARTICLES)

        rewrite['rewrite_of'] = original[config.ID_FIELD]
        rewrite['rewrite_sequence'] = (original.get('rewrite_sequence')
                                       or 0) + 1
        rewrite.pop(PROCESSED_FROM, None)

        if not existing_item:
            # send the document to the desk only if a new rewrite is created
            send_to(doc=rewrite,
                    desk_id=(desk_id or original['task']['desk']),
                    default_stage='working_stage',
                    user_id=get_user_id())

            # if we are rewriting a published item then copy the body_html
            if original.get('state', '') in (CONTENT_STATE.PUBLISHED,
                                             CONTENT_STATE.CORRECTED,
                                             CONTENT_STATE.SCHEDULED):
                rewrite['body_html'] = original.get('body_html', '')

        rewrite[ITEM_STATE] = CONTENT_STATE.PROGRESS
        self._set_take_key(rewrite)
        return rewrite
Example #45
    def link_archive_items_to_assignments(self, assignment, related_items,
                                          actioned_item, doc):
        assignments_service = get_resource_service('assignments')
        delivery_service = get_resource_service('delivery')
        assignments_service.validate_assignment_action(assignment)
        already_completed = (assignment['assigned_to']['state']
                             == ASSIGNMENT_WORKFLOW_STATE.COMPLETED)
        items = []
        ids = []
        deliveries = []
        published_updated_items = []
        updates = {'assigned_to': deepcopy(assignment.get('assigned_to'))}
        need_complete = None
        for item in related_items:
            if not item.get('assignment_id') or \
                    (item['_id'] == actioned_item.get('_id') and doc.get('force')):
                # Update the delivery for the item if one exists
                delivery = delivery_service.find_one(
                    req=None, item_id=item[config.ID_FIELD])
                if delivery:
                    delivery_service.patch(delivery['_id'], {
                        'assignment_id': assignment['_id'],
                        'scheduled_update_id': assignment.get('scheduled_update_id'),
                    })
                else:
                    # Add a delivery for the item
                    deliveries.append({
                        'item_id': item[config.ID_FIELD],
                        'assignment_id': assignment.get(config.ID_FIELD),
                        'planning_id': assignment['planning_item'],
                        'coverage_id': assignment['coverage_item'],
                        'item_state': item.get('state'),
                        'sequence_no': item.get('rewrite_sequence') or 0,
                        'publish_time': get_delivery_publish_time(item),
                        'scheduled_update_id': assignment.get('scheduled_update_id'),
                    })

                # Update archive/published collection with assignment linking
                update_assignment_on_link_unlink(assignment[config.ID_FIELD],
                                                 item, published_updated_items)

                ids.append(item.get(config.ID_FIELD))
                items.append(item)

                if item.get(ITEM_STATE) in [CONTENT_STATE.PUBLISHED, CONTENT_STATE.CORRECTED] and \
                        not assignment.get('scheduled_update_id') and \
                        assignment['assigned_to']['state'] != ASSIGNMENT_WORKFLOW_STATE.COMPLETED:
                    # If assignment belongs to coverage, 'complete' it if any news item is published
                    need_complete = True

        # Create all deliveries
        if len(deliveries) > 0:
            delivery_service.post(deliveries)

        self.update_assignment(updates, assignment, actioned_item,
                               doc.pop('reassign', None), already_completed,
                               need_complete)
        actioned_item['assignment_id'] = assignment[config.ID_FIELD]
        doc.update(actioned_item)

        # Save assignment history
        # Update assignment history with all items affected
        if len(ids) > 0:
            updates['assigned_to']['item_ids'] = ids
            if not assignment.get('scheduled_update_id'):
                assignment_history_service = get_resource_service(
                    'assignments_history')
                assignment_history_service.on_item_content_link(
                    updates, assignment)

            if (actioned_item.get(ITEM_STATE)
                    not in [CONTENT_STATE.PUBLISHED, CONTENT_STATE.CORRECTED]
                    or already_completed) and not need_complete:
                # publishing planning item
                assignments_service.publish_planning(
                    assignment['planning_item'])

        # Send notifications
        push_content_notification(items)
        push_notification('content:link',
                          item=str(actioned_item[config.ID_FIELD]),
                          assignment=assignment[config.ID_FIELD])
        return ids
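For reference, each record appended to deliveries in the loop above ends up shaped like the following sketch; the values are illustrative placeholders, not taken from a live system:

# Hypothetical delivery record as built in link_archive_items_to_assignments.
example_delivery = {
    'item_id': 'urn:newsml:localhost:5000:example-item',  # item[config.ID_FIELD]
    'assignment_id': 'assignment-id',                     # assignment[config.ID_FIELD]
    'planning_id': 'planning-id',                         # assignment['planning_item']
    'coverage_id': 'coverage-id',                         # assignment['coverage_item']
    'item_state': 'published',                            # item.get('state')
    'sequence_no': 0,                                     # rewrite_sequence, defaulting to 0
    'publish_time': None,                                 # get_delivery_publish_time(item)
    'scheduled_update_id': None,
}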
Example #46
0
    def _users_aggregation(self, desk_id: str) -> List[Dict]:
        desks_service = superdesk.get_resource_service("desks")

        es_query: Dict[str, Any]
        es_assign_query: Dict[str, Any]
        desk_filter: Dict[str, Any]

        if desk_id == "all":
            desk_filter = {}
            es_query = {}
        else:
            desk_filter = {"_id": ObjectId(desk_id)}
            es_query = {"filter": [{"term": {"task.desk": desk_id}}]}

        req = ParsedRequest()
        req.projection = json.dumps({"members": 1})
        found = desks_service.get(req, desk_filter)
        members = set()
        for d in found:
            members.update({m["user"] for m in d.get("members", [])})

        users_aggregation = app.data.pymongo().db.users.aggregate([
            {
                "$match": {
                    "_id": {
                        "$in": list(members)
                    }
                }
            },
            {
                "$group": {
                    "_id": "$role",
                    "authors": {
                        "$addToSet": "$_id"
                    }
                }
            },
        ])

        # only run aggregations on content accessible by the user
        content_filters = superdesk.get_resource_service(
            "search").get_archive_filters()
        if content_filters:
            es_query.setdefault("filter", []).extend(content_filters)

        # first we check archives for locked items
        es_query["aggs"] = {
            "desk_authors": {
                "filter": {
                    "bool": {
                        "filter": {
                            "terms": {
                                "lock_user": [str(m) for m in members]
                            }
                        }
                    }
                },
                "aggs": {
                    "authors": {
                        "terms": {
                            "field": "lock_user",
                            "size": SIZE_MAX
                        },
                        "aggs": {
                            "locked": {
                                "filter": {
                                    "exists": {
                                        "field": "lock_user",
                                    }
                                }
                            },
                        },
                    }
                },
            }
        }
        docs_agg = app.data.elastic.search(es_query,
                                           "archive",
                                           params={"size": 0})
        stats_by_authors = {}
        for a in docs_agg.hits["aggregations"]["desk_authors"]["authors"][
                "buckets"]:
            stats_by_authors[a["key"]] = {
                "locked": a["locked"]["doc_count"],
                "assigned": 0,
            }

        # then assignments
        if desk_id == "all":
            desk_filter = {}
            es_assign_query = {}
        else:
            desk_filter = {"_id": ObjectId(desk_id)}
            es_assign_query = {
                "filter": {
                    "term": {
                        "assigned_to.desk": desk_id
                    }
                }
            }
        es_assign_query["aggs"] = {
            "desk_authors": {
                "filter": {
                    "terms": {
                        "assigned_to.user": [str(m) for m in members]
                    }
                },
                "aggs": {
                    "authors": {
                        "terms": {
                            "field": "assigned_to.user",
                            "size": SIZE_MAX
                        },
                    }
                },
            }
        }
        try:
            assign_agg = app.data.elastic.search(es_assign_query,
                                                 "assignments",
                                                 params={"size": 0})
        except KeyError:
            logger.warning(
                'Can\'t access "assignments" collection, planning is probably not installed'
            )
        else:
            for a in assign_agg.hits["aggregations"]["desk_authors"][
                    "authors"]["buckets"]:
                stats_by_authors.setdefault(
                    a["key"], {"locked": 0})["assigned"] = a["doc_count"]

        overview = []
        for a in users_aggregation:
            role = a["_id"]
            authors_dict: Dict[str, Any] = {}
            role_dict = {
                "role": role,
                "authors": authors_dict,
            }
            authors = a["authors"]
            for author in authors:
                author = str(author)
                try:
                    authors_dict[author] = stats_by_authors[author]
                except KeyError:
                    logger.debug(
                        "No article found for {author}".format(author=author))
                    authors_dict[author] = {"assigned": 0, "locked": 0}
            overview.append(role_dict)

        return overview
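The overview returned above is a list of per-role dictionaries keyed by author id. A minimal sketch of the shape, with hypothetical ids and counts:

# Hypothetical return value of _users_aggregation("all")
overview_example = [
    {
        'role': 'editor',  # the users' role _id ($group key; may be None for users without a role)
        'authors': {
            '5d385f17fe985ec5e1a78b49': {'locked': 2, 'assigned': 1},
            '5d385f31fe985ec67a0ca583': {'locked': 0, 'assigned': 0},
        },
    },
]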
Example #47
0
def init_default_content_profile(doc):
    if not doc.get('profile'):
        desk_id = doc.get('task', {}).get('desk')
        desk = get_resource_service('desks').find_one(req=None, _id=desk_id)
        doc['profile'] = (desk or {}).get('default_content_profile')
    def setUp(self):
        try:
            from apps.legal_archive.commands import ImportLegalArchiveCommand
        except ImportError:
            self.fail("Could not import class under test (ImportLegalArchiveCommand).")
        else:
            self.class_under_test = ImportLegalArchiveCommand
            self.app.data.insert("desks", self.desks)
            self.app.data.insert("users", self.users)
            self.validators = [
                {"schema": {}, "type": "text", "act": "publish", "_id": "publish_text"},
                {"schema": {}, "type": "text", "act": "correct", "_id": "correct_text"},
                {"schema": {}, "type": "text", "act": "kill", "_id": "kill_text"},
            ]

            self.products = [
                {"_id": "1", "name": "prod1"},
                {"_id": "2", "name": "prod2", "codes": "abc,def"},
                {"_id": "3", "name": "prod3", "codes": "xyz"},
            ]

            self.subscribers = [
                {
                    "name": "Test",
                    "is_active": True,
                    "subscriber_type": "wire",
                    "email": "*****@*****.**",
                    "sequence_num_settings": {"max": 9999, "min": 1},
                    "products": ["1"],
                    "destinations": [
                        {
                            "name": "test",
                            "delivery_type": "email",
                            "format": "nitf",
                            "config": {"recipients": "*****@*****.**"},
                        }
                    ],
                }
            ]
            self.app.data.insert("validators", self.validators)
            self.app.data.insert("products", self.products)
            self.app.data.insert("subscribers", self.subscribers)
            self.class_under_test = ImportLegalArchiveCommand
            self.archive_items = [
                {
                    "task": {"desk": self.desks[0]["_id"], "stage": self.desks[0]["incoming_stage"], "user": "******"},
                    "_id": "item1",
                    "state": "in_progress",
                    "headline": "item 1",
                    "type": "text",
                    "slugline": "item 1 slugline",
                    "_current_version": 1,
                    "_created": utcnow() - timedelta(minutes=3),
                    "expired": utcnow() - timedelta(minutes=30),
                },
                {
                    "task": {"desk": self.desks[0]["_id"], "stage": self.desks[0]["incoming_stage"], "user": "******"},
                    "_id": "item2",
                    "state": "in_progress",
                    "headline": "item 2",
                    "type": "text",
                    "slugline": "item 2 slugline",
                    "_current_version": 1,
                    "_created": utcnow() - timedelta(minutes=2),
                    "expired": utcnow() - timedelta(minutes=30),
                },
                {
                    "task": {"desk": self.desks[0]["_id"], "stage": self.desks[0]["incoming_stage"], "user": "******"},
                    "_id": "item3",
                    "state": "in_progress",
                    "headline": "item 2",
                    "type": "text",
                    "slugline": "item 2 slugline",
                    "_current_version": 1,
                    "_created": utcnow() - timedelta(minutes=1),
                    "expired": utcnow() - timedelta(minutes=30),
                },
            ]

            get_resource_service(ARCHIVE).post(self.archive_items)
            for item in self.archive_items:
                resolve_document_version(item, ARCHIVE, "POST")
                insert_into_versions(id_=item["_id"])
Example #49
0
    def import_file(self,
                    entity_name,
                    path,
                    file_name,
                    index_params,
                    do_patch=False,
                    force=False):
        """Imports seed data based on the entity_name (resource name) from the file_name specified.

        index_params use to create index for that entity/resource

        :param str entity_name: name of the resource
        :param str file_name: file name that contains seed data
        :param list index_params: list of indexes that is created on that entity.
        For example:
        [[("first_name", pymongo.ASCENDING), ("last_name", pymongo.ASCENDING)], "username"] will create two indexes
        - composite index of "first_name", "last_name" field.
        - index on username field.
        Alternatively index param can be specified as
        [[("first_name", pymongo.ASCENDING), ("last_name", pymongo.ASCENDING)], [("username", pymongo.ASCENDING)]]
        Refer to pymongo create_index documentation for more information.
        http://api.mongodb.org/python/current/api/pymongo/collection.html
        :param bool do_patch: if True then patch the document else don't patch.
        """
        logger.info("Process %r", entity_name)
        file_path = file_name and get_filepath(file_name, path)
        if not file_path:
            pass
        elif not file_path.exists():
            logger.info(" - file not exists: %s", file_path)
        else:
            logger.info(" - got file path: %s", file_path)
            with file_path.open("rt", encoding="utf-8") as app_prepopulation:
                service = superdesk.get_resource_service(entity_name)
                json_data = json.loads(app_prepopulation.read())
                data = [fillEnvironmentVariables(item) for item in json_data]
                data = [
                    app.data.mongo._mongotize(item, service.datasource)
                    for item in data if item
                ]
                existing_data = []
                existing = service.get_from_mongo(None, {})
                update_data = True
                if not do_patch and existing.count() > 0:
                    logger.info(" - data already exists none will be loaded")
                    update_data = False
                elif do_patch and existing.count() > 0:
                    logger.info(" - data already exists it will be updated")

                if update_data:
                    if do_patch:
                        for item in existing:
                            for loaded_item in data:
                                if "_id" in loaded_item and loaded_item[
                                        "_id"] == item["_id"]:
                                    data.remove(loaded_item)
                                    if force or item.get("init_version",
                                                         0) < loaded_item.get(
                                                             "init_version",
                                                             0):
                                        existing_data.append(loaded_item)

                    if data:
                        for item in data:
                            if not item.get(config.ETAG):
                                item.setdefault(config.ETAG, "init")
                        service.post(data)

                    if existing_data and do_patch:
                        for item in existing_data:
                            item["_etag"] = "init"
                            service.update(
                                item["_id"], item,
                                service.find_one(None, _id=item["_id"]))

                logger.info(" - file imported successfully: %s", file_name)

        if index_params:
            for index in index_params:
                crt_index = list(index) if isinstance(index, list) else index
                options = crt_index.pop() if isinstance(crt_index[-1], dict) and isinstance(index, list) else {}
                collection = app.data.mongo.pymongo(
                    resource=entity_name).db[entity_name]
                options.setdefault("background", True)
                index_name = collection.create_index(crt_index, **options)
                logger.info(
                    " - index: %s for collection %s created successfully.",
                    index_name, entity_name)
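A hedged sketch of calling import_file with the index_params formats described in the docstring above; the instance name and path are illustrative assumptions, not from the source:

import pymongo

# Hypothetical invocation: one compound index plus one single-field index.
app_initializer.import_file(          # app_initializer is an assumed instance name
    entity_name='users',
    path='/opt/superdesk/data',       # illustrative path
    file_name='users.json',
    index_params=[
        [('first_name', pymongo.ASCENDING), ('last_name', pymongo.ASCENDING)],
        'username',
    ],
    do_patch=True,
)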
    def _validate(self, doc):
        assignment = get_resource_service('assignments').find_one(
            req=None, _id=doc.get('assignment_id'))

        if not assignment:
            raise SuperdeskApiError.badRequestError('Assignment not found.')

        item = get_resource_service('archive').find_one(req=None,
                                                        _id=doc.get('item_id'))

        if not item:
            raise SuperdeskApiError.badRequestError('Content item not found.')

        if not doc.get('force') and item.get('assignment_id'):
            raise SuperdeskApiError.badRequestError(
                'Content is already linked to an assignment. Cannot link assignment and content.'
            )

        if not is_assigned_to_a_desk(item):
            raise SuperdeskApiError.badRequestError(
                'Content not in workflow. Cannot link assignment and content.')

        if not item.get('rewrite_of'):
            delivery = get_resource_service('delivery').find_one(
                req=None, assignment_id=ObjectId(doc.get('assignment_id')))

            if delivery:
                raise SuperdeskApiError.badRequestError(
                    'Content already exists for the assignment. Cannot link assignment and content.'
                )

            # scheduled update validation
            if assignment.get('scheduled_update_id'):
                raise SuperdeskApiError.badRequestError(
                    'Only updates can be linked to a scheduled update assignment'
                )

        coverage = get_coverage_for_assignment(assignment)
        allowed_states = [
            ASSIGNMENT_WORKFLOW_STATE.IN_PROGRESS,
            ASSIGNMENT_WORKFLOW_STATE.COMPLETED
        ]
        if (coverage and len(coverage.get('scheduled_updates') or []) > 0
                and str(assignment['_id']) != str(
                    (coverage.get('assigned_to') or {}).get('assignment_id'))):
            if (coverage.get('assigned_to')
                    or {}).get('state') not in allowed_states:
                raise SuperdeskApiError(
                    'Previous coverage is not linked to content.')

            # Check all previous scheduled updated to be linked/completed
            for s in coverage.get('scheduled_updates'):
                assigned_to = (s.get('assigned_to') or {})
                if str(assigned_to.get('assignment_id')) == str(
                        doc.get('assignment_id')):
                    break

                if assigned_to.get('state') not in allowed_states:
                    raise SuperdeskApiError(
                        'Previous scheduled-update pending content-linking/completion'
                    )
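A minimal sketch of a link document that would pass the checks above, assuming the referenced assignment and archive item exist, the item sits on a desk, and it is not yet linked (ids are placeholders):

doc = {
    'assignment_id': '5d385f17fe985ec5e1a78b49',         # existing assignment _id
    'item_id': 'urn:newsml:localhost:5000:example-item',  # existing archive item _id
    'force': False,                                       # set True to relink an already-linked item
}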
def generate_text_item(items, template_name, resource_type):
    template = get_resource_service('planning_export_templates').get_export_template(template_name, resource_type)
    archive_service = get_resource_service('archive')
    if not template:
        raise SuperdeskApiError.badRequestError('Invalid template selected')

    for item in items:
        # Create list of assignee with preference to coverage_provider, if not, assigned user
        item['published_archive_items'] = []
        item['assignees'] = []
        item['text_assignees'] = []
        item['contacts'] = []
        text_users = []
        text_desks = []
        users = []
        desks = []

        def enhance_coverage(planning, item, users):
            for c in (planning.get('coverages') or []):
                is_text = c.get('planning', {}).get('g2_content_type', '') == 'text'
                completed = (c.get('assigned_to') or {}).get('state') == ASSIGNMENT_WORKFLOW_STATE.COMPLETED
                assigned_to = c.get('assigned_to') or {}
                user = None
                desk = None
                if assigned_to.get('coverage_provider'):
                    item['assignees'].append(assigned_to['coverage_provider']['name'])
                    if is_text and not completed:
                        item['text_assignees'].append(assigned_to['coverage_provider']['name'])
                elif assigned_to.get('user'):
                    user = assigned_to['user']
                    users.append(user)
                elif assigned_to.get('desk'):
                    desk = assigned_to.get('desk')
                    desks.append(desk)

                # Get abstract from related text item if coverage is 'complete'
                if is_text:
                    if completed:
                        results = list(archive_service.get_from_mongo(req=None,
                                                                      lookup={
                                                                          'assignment_id': ObjectId(
                                                                              c['assigned_to']['assignment_id']),
                                                                          'state': {'$in': ['published', 'corrected']},
                                                                          'pubstatus': 'usable',
                                                                          'rewrite_of': None
                                                                      }))
                        if len(results) > 0:
                            item['published_archive_items'].append({
                                'archive_text': get_first_paragraph_text(results[0].get('abstract')) or '',
                                'archive_slugline': results[0].get('slugline') or ''
                            })
                    elif c.get('news_coverage_status', {}).get('qcode') == 'ncostat:int':
                        if user:
                            text_users.append(user)
                        else:
                            text_desks.append(desk)

            item['contacts'] = get_contacts_from_item(item)

        if resource_type == 'planning':
            enhance_coverage(item, item, users)
        else:
            for p in (item.get('plannings') or []):
                enhance_coverage(p, item, users)

        users = get_resource_service('users').find(where={
            '_id': {'$in': users}
        })

        desks = get_resource_service('desks').find(where={
            '_id': {'$in': desks}
        })

        for u in users:
            name = "{0} {1}".format(u.get('last_name'), u.get('first_name'))
            item['assignees'].append(name)
            if str(u['_id']) in text_users:
                item['text_assignees'].append(name)

        for d in desks:
            item['assignees'].append(d['name'])
            if str(d['_id']) in text_desks:
                item['text_assignees'].append(d['name'])

        set_item_place(item)

        item['description_text'] = item.get('description_text') or (item.get('event') or {}).get('definition_short')
        item['slugline'] = item.get('slugline') or (item.get('event') or {}).get('name')

        # Handle dates and remote time-zones
        if item.get('dates') or (item.get('event') or {}).get('dates'):
            dates = item.get('dates') or item.get('event').get('dates')
            item['schedule'] = utc_to_local(config.DEFAULT_TIMEZONE, dates.get('start'))
            if get_timezone_offset(config.DEFAULT_TIMEZONE, utcnow()) !=\
                    get_timezone_offset(dates.get('tz'), utcnow()):
                item['schedule'] = "{} ({})".format(item['schedule'].strftime('%H%M'), item['schedule'].tzname())
            else:
                item['schedule'] = item['schedule'].strftime('%H%M')

    agendas = []
    if resource_type == 'planning':
        agendas = group_items_by_agenda(items)
        inject_internal_converages(items)

        labels = {}
        cv = get_resource_service('vocabularies').find_one(req=None, _id='g2_content_type')
        if cv:
            labels = {_type['qcode']: _type['name'] for _type in cv['items']}

        for item in items:
            item['coverages'] = [labels.get(coverage.get('planning').get('g2_content_type'),
                                            coverage.get('planning').get('g2_content_type')) +
                                 (' (cancelled)' if coverage.get('workflow_status', '') == 'cancelled' else '')
                                 for coverage in item.get('coverages', [])
                                 if (coverage.get('planning') or {}).get('g2_content_type')]

    article = {}

    for key, value in template.items():
        if value.endswith(".html"):
            article[key.replace('_template', '')] = render_template(value, items=items, agendas=agendas)
        else:
            article[key] = render_template_string(value, items=items, agendas=agendas)

    return article
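generate_text_item branches on whether a template value ends in '.html': file templates go through render_template and the '_template' suffix is stripped from the key, while inline strings go through render_template_string with the key kept as-is. A hedged sketch of an export template record (field names and file name are illustrative):

example_template = {
    # rendered via render_template; stored on the article as 'body_html'
    'body_html_template': 'planning_export.html',
    # rendered via render_template_string; key is kept unchanged
    'headline': '{{ items | length }} planned items',
}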
Example #52
0
 def _get_vocabulary_display_name(self, vocabulary_id):
     vocabulary = get_resource_service('vocabularies').find_one(
         req=None, _id=vocabulary_id)
     if vocabulary and 'display_name' in vocabulary:
         return vocabulary['display_name']
     return vocabulary_id
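Usage is a one-liner; the helper falls back to the raw id when the vocabulary is missing or has no display_name (the vocabulary id here is illustrative):

# Returns e.g. 'Belga Keywords' when the vocabulary exists,
# otherwise the id 'belga-keywords' itself.
label = self._get_vocabulary_display_name('belga-keywords')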
Example #53
0
    def test_belga_keywords(self):
        self.app.data.insert(
            'desks', [{
                '_id': ObjectId('5d385f17fe985ec5e1a78b49'),
                'name': 'Politic Desk',
                'default_content_profile': 'belga_text',
                'default_content_template': 'content_template_1',
                'desk_language': 'fr',
                'source': 'politic'
            }])
        self.app.data.insert('stages',
                             [{
                                 '_id': ObjectId('5d385f31fe985ec67a0ca583'),
                                 'name': 'Incoming Stage',
                                 'default_incoming': True,
                                 'desk_order': 2,
                                 'content_expiry': None,
                                 'working_stage': False,
                                 'is_visible': True,
                                 'desk': ObjectId('5d385f17fe985ec5e1a78b49')
                             }])
        self.app.data.insert('vocabularies', [{
            "_id": "belga-keywords",
            "display_name": "Belga Keywords",
            "type": "manageable",
            "selection_type": "multi selection",
            "unique_field": "qcode",
            "schema": {
                "name": {},
                "qcode": {},
                "translations": {}
            },
            "service": {
                "all": 1
            },
            "items": [{
                "name": "BRIEF",
                "qcode": "BRIEF",
                "is_active": True,
                "translations": {
                    "name": {
                        "nl": "BRIEF",
                        "fr": "BRIEF"
                    }
                }
            }, {
                "name": "PREVIEW",
                "qcode": "PREVIEW",
                "is_active": True,
                "translations": {
                    "name": {
                        "nl": "VOORBERICHT",
                        "fr": "AVANT-PAPIER"
                    }
                }
            }]
        }])
        self.app.data.insert('content_templates', [{
            '_id': 'content_template_1',
            'template_name': 'belga text',
            'is_public': True,
            'data': {
                'profile': 'belga_text',
                'type': 'text',
                'pubstatus': 'usable',
                'format': 'HTML',
                'headline': '',
                'language': 'en',
                'keywords': ['some', 'keyword'],
                'body_html': ''
            },
            'template_type': 'create',
        }])
        item = {
            '_id': 'urn:newsml:localhost:5000:2019-12-10T14:43:46.224107:d13ac5ae-7f43-4b7f-89a5-2c6835389564',
            'guid': 'urn:newsml:localhost:5000:2019-12-10T14:43:46.224107:d13ac5ae-7f43-4b7f-89a5-2c6835389564',
            'headline': 'test headline',
            'slugline': 'test slugline',
            'state': 'published',
            'type': 'text',
            "subject": [{
                'name': 'BRIEF',
                'qcode': 'BRIEF',
                'translations': {
                    'name': {
                        'nl': 'BRIEF',
                        'fr': 'BRIEF'
                    }
                },
                'scheme': 'belga-keywords'
            }],
            'keywords': ['foo', 'bar'],
            'language': 'fr'
        }
        self.app.data.insert('archive', [item])
        self.assertRaises(StopDuplication,
                          set_default_metadata_with_translate,
                          item,
                          dest_desk_id=ObjectId('5d385f17fe985ec5e1a78b49'),
                          dest_stage_id=ObjectId('5d385f31fe985ec67a0ca583'))
        archive_service = get_resource_service('archive')
        new_item = archive_service.find_one(
            req=None,
            original_id='urn:newsml:localhost:5000:2019-12-10T14:43:46.224107:d13ac5ae-7f43-4b7f-89a5-2c6835389564')

        self.assertEqual(item["subject"], new_item["subject"])
def get_desk_template(desk):
    default_content_template = desk.get('default_content_template')
    if default_content_template:
        return get_resource_service('content_templates').find_one(req=None, _id=default_content_template)

    return {}
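A sketch of the helper in use, assuming the desk id refers to an existing desk; it returns {} when no default template is configured:

desk_id = '5d385f17fe985ec5e1a78b49'   # illustrative desk id
desk = get_resource_service('desks').find_one(req=None, _id=desk_id)
template = get_desk_template(desk or {})   # {} when the desk has no default_content_template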
 def _get_content_filters_by_content_filter(self, content_filter_id):
     lookup = {'content_filter.expression.pf': {'$in': [content_filter_id]}}
     content_filters = get_resource_service('content_filters').get(
         req=None, lookup=lookup)
     return content_filters
Example #56
0
    def test_duplicate(self):
        self.app.data.insert(
            'desks', [{
                '_id': ObjectId('5d385f17fe985ec5e1a78b49'),
                'name': 'Politic Desk',
                'default_content_profile': 'belga_text',
                'default_content_template': 'content_template_1',
                'desk_language': 'fr',
                'source': 'politic'
            }])
        self.app.data.insert('stages',
                             [{
                                 '_id': ObjectId('5d385f31fe985ec67a0ca583'),
                                 'name': 'Incoming Stage',
                                 'default_incoming': True,
                                 'desk_order': 2,
                                 'content_expiry': None,
                                 'working_stage': False,
                                 'is_visible': True,
                                 'desk': ObjectId('5d385f17fe985ec5e1a78b49')
                             }])
        self.app.data.insert('content_templates', [{
            '_id': 'content_template_1',
            'template_name': 'belga text',
            'is_public': True,
            'data': {
                'profile': 'belga_text',
                'type': 'text',
                'pubstatus': 'usable',
                'format': 'HTML',
                'headline': '',
                'subject': [
                    {
                        'name': 'INT/GENERAL',
                        'qcode': 'INT/GENERAL',
                        'parent': 'INT',
                        'scheme': 'services-products'
                    },
                    {
                        'name': 'default',
                        'qcode': 'default',
                        'scheme': 'distribution'
                    },
                ],
                'language': 'en',
                'keywords': ['some', 'keyword'],
                'body_html': ''
            },
            'template_type': 'create',
        }])
        item = {
            '_id': 'urn:newsml:localhost:5000:2019-12-10T14:43:46.224107:d13ac5ae-7f43-4b7f-89a5-2c6835389564',
            'guid': 'urn:newsml:localhost:5000:2019-12-10T14:43:46.224107:d13ac5ae-7f43-4b7f-89a5-2c6835389564',
            'headline': 'test headline',
            'slugline': 'test slugline',
            'state': 'published',
            'type': 'text',
            'keywords': ['foo', 'bar'],
            'language': 'en'
        }
        self.app.data.insert('archive', [item])
        self.assertRaises(StopDuplication,
                          set_default_metadata_with_translate,
                          item,
                          dest_desk_id=ObjectId('5d385f17fe985ec5e1a78b49'),
                          dest_stage_id=ObjectId('5d385f31fe985ec67a0ca583'))

        archive_service = get_resource_service('archive')
        new_item = archive_service.find_one(
            req=None,
            original_id='urn:newsml:localhost:5000:2019-12-10T14:43:46.224107:d13ac5ae-7f43-4b7f-89a5-2c6835389564')
        self.assertNotIn('translated_from', new_item)
Example #57
0
def archive_item(self, guid, provider_id, user, task_id=None):
    try:
        # For CELERY_ALWAYS_EAGER=True the current request context is
        # empty, but an already initialized one is on the request_stack
        if app.config['CELERY_ALWAYS_EAGER']:
            self.request_stack.pop()

        crt_task_id = self.request.id
        if not task_id:
            task_id = crt_task_id

        if not self.request.retries:
            update_status(*add_subtask_to_progress(task_id))

        provider = superdesk.get_resource_service('ingest_providers').find_one(
            req=None, _id=provider_id)
        if provider is None:
            message = 'For ingest with guid= %s, failed to retrieve provider with _id=%s' % (
                guid, provider_id)
            raise_fail(task_id, message)
        service_provider = superdesk.io.providers[provider.get('type')]
        service_provider.provider = provider

        item = None
        old_item = False
        try:
            items = service_provider.get_items(guid)
        except LookupError:
            ingest_doc = superdesk.get_resource_service('ingest').find_one(
                req=None, _id=guid)
            if not ingest_doc:
                message = 'Ingest item with guid %s not found for provider %s' % (
                    guid, provider.get('type'))
                raise_fail(task_id, message)
            else:
                old_item = True
                ingest_doc.pop('_id')
                items = [ingest_doc]
        except Exception:
            raise self.retry(countdown=2)

        for item_it in items:
            if 'guid' in item_it and item_it['guid'] == guid:
                item = item_it
                break

        if item is None:
            message = 'Ingest returned, but item with guid %s not found for provider %s' \
                      % (guid, provider.get('type'))
            raise_fail(task_id, message)

        if not old_item:
            item['created'] = item['firstcreated'] = utc.localize(
                item['firstcreated'])
            item['updated'] = item['versioncreated'] = utc.localize(
                item['versioncreated'])
        '''
        Necessary because flask.g.user is None while fetching packages for the grouped items or
        while patching the archive collection. Without this, version_creator is set to None, which doesn't make sense.
        '''
        flask.g.user = user
        remove_unwanted(item)
        superdesk.get_resource_service(ARCHIVE).patch(guid, item)

        tasks = []
        for group in item.get('groups', []):
            for ref in group.get('refs', []):
                if 'residRef' in ref:
                    resid_ref = ref.get('residRef')
                    doc = {
                        'guid': resid_ref,
                        'ingest_provider': provider_id,
                        'task_id': crt_task_id
                    }

                    archived_doc = superdesk.get_resource_service(
                        ARCHIVE).find_one(req=None, guid=doc.get('guid'))
                    # check if task already started
                    if not archived_doc:
                        doc.setdefault('_id', doc.get('guid'))
                        superdesk.get_resource_service(ARCHIVE).post([doc])
                    elif archived_doc.get('task_id') == crt_task_id:
                        # it is a retry so continue
                        archived_doc.update(doc)
                        remove_unwanted(archived_doc)
                        superdesk.get_resource_service(ARCHIVE).patch(
                            archived_doc.get('_id'), archived_doc)
                    else:
                        # there is a cyclic dependency, skip it
                        continue

                    mark_ingest_as_archived(doc.get('guid'))
                    tasks.append(
                        archive_item.s(resid_ref, provider.get('_id'), user,
                                       task_id))

        for rendition in item.get('renditions', {}).values():
            href = service_provider.prepare_href(rendition['href'])
            if rendition['rendition'] == 'baseImage':
                tasks.append(archive_media.s(task_id, guid, href))
            else:
                tasks.append(
                    archive_rendition.s(task_id, guid, rendition['rendition'],
                                        href))

        update_status(*finish_subtask_from_progress(task_id))
        if tasks:
            chord((task for task in tasks),
                  update_item.s(crt_task_id == task_id, task_id,
                                guid)).delay()
        else:
            insert_into_versions(guid, task_id)
            if task_id == crt_task_id:
                update_status(*finish_task_for_progress(task_id))
    except Exception:
        logger.error(traceback.format_exc())
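Since archive_item is bound as a Celery task (it uses self.request and self.retry), a caller would typically dispatch it asynchronously. A hedged sketch with placeholder arguments:

# Hypothetical dispatch; guid, provider _id and user come from the ingest trigger.
archive_item.delay('tag:example.com,0000:newsml_XYZ123', provider['_id'], user)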
    def _process_bunch(self, x):
        # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
        items = []
        for doc in x.findall('dc_rest_docs/dc_rest_doc'):
            try:
                # print(doc.get('href'))
                id = doc.find('dcdossier').get('id')
                if self._direction:
                    if int(id) > self._id:
                        self._id = int(id)
                else:
                    if int(id) < self._id:
                        self._id = int(id)
                item = {}
                item['guid'] = doc.find('dcdossier').get('guid')
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
                format = self._get_head_value(doc, 'Format')
                if format == 't':
                    item[FORMAT] = FORMATS.PRESERVED
                else:
                    item[FORMAT] = FORMATS.HTML
                # item[FORMAT] = FORMATS.HTML

                # if the item has been modified in the archive then it is due to a kill
                # there is an argument that this item should not be imported at all
                if doc.find('dcdossier').get('created') != doc.find(
                        'dcdossier').get('modified'):
                    # item[ITEM_STATE] = CONTENT_STATE.KILLED
                    continue
                else:
                    item[ITEM_STATE] = CONTENT_STATE.PUBLISHED

                value = datetime.strptime(
                    self._get_head_value(doc, 'PublicationDate'),
                    '%Y%m%d%H%M%S')
                local_tz = pytz.timezone('Australia/Sydney')
                try:
                    aus_dt = local_tz.localize(value, is_dst=None)
                except NonExistentTimeError as ex:
                    aus_dt = local_tz.localize(value, is_dst=True)
                except AmbiguousTimeError:
                    aus_dt = local_tz.localize(value, is_dst=False)

                item['firstcreated'] = aus_dt.astimezone(pytz.utc)
                item['versioncreated'] = item['firstcreated']

                generate_unique_id_and_name(item)
                item['ingest_id'] = id

                last_line = None
                el = doc.find('dcdossier/document/body/BodyText')
                if el is not None:
                    story = el.text
                    lines = story.split('\n')
                    if len(lines) > 0:
                        last_line = lines[-1]
                    if item.get(FORMAT) == FORMATS.HTML:
                        story = story.replace('\n   ', '<p></p>')
                        story = story.replace('\n', '<br>')
                        item['body_html'] = '<p>' + story + '</p>'
                    else:
                        item['body_html'] = '<pre>' + story + '</pre>'
                    try:
                        item['word_count'] = get_text_word_count(
                            item['body_html'])
                    except Exception:
                        pass
                else:
                    # Items with no body are ignored
                    continue

                item['source'] = self._get_head_value(doc, 'Agency')
                # if the source document contains no agency then by definition it is unknown
                if item['source'] is None:
                    item['source'] = 'UNKNOWN'
                else:
                    # check if the source of the document was Newscentre
                    dc_unique = doc.find('dcdossier').get('unique')
                    if dc_unique.startswith('NC.') and last_line is not None:
                        # The AFR summary articles all have agency values 25 chars long
                        if len(item['source']) == 25:
                            item['source'] = 'AAP'
                        # is it a numeric Agency
                        elif self._get_head_value(doc, 'Agency').isdigit():
                            # split(' ') always yields at least one element,
                            # so the first token is always available
                            sign_off = last_line.split(' ')
                            item['source'] = sign_off[0].upper()
                            # clean up what we have extracted
                            if item['source'].startswith('AAP'):
                                item['source'] = 'AAP'
                            else:
                                # make sure it is one of the known values
                                if item['source'] not in {
                                        'AAP', 'AP', 'REUT', 'Asia Pulse',
                                        'DPA', 'AFP', 'RAW', 'NZA', 'NZPA',
                                        'KRT', 'PA', 'PAA', 'SNI', 'REUTERS'
                                }:
                                    print('Source : {}'.format(item['source']))
                                    item['source'] = 'UNKNOWN'

                # self._addkeywords('AsiaPulseCodes', doc, item)

                byline = self._get_head_value(doc, 'Byline')
                if byline:
                    item['byline'] = byline

                # item['service'] = self._get_head_value(doc,'Service')

                category = self._get_head_value(doc, 'Category')
                if not category:
                    publication_name = self._get_head_value(
                        doc, 'PublicationName')
                    if publication_name in pubnames:
                        category = pubnames[publication_name]
                if category:
                    anpacategory = {}
                    anpacategory['qcode'] = category
                    for anpa_category in self._anpa_categories['items']:
                        if anpacategory['qcode'].lower(
                        ) == anpa_category['qcode'].lower():
                            anpacategory = {
                                'qcode': anpacategory['qcode'],
                                'name': anpa_category['name']
                            }
                            break
                    item['anpa_category'] = [anpacategory]

                self._addkeywords('CompanyCodes', doc, item)

                item['keyword'] = self._get_head_value(doc, 'Keyword')
                item['ingest_provider_sequence'] = self._get_head_value(
                    doc, 'Sequence')

                original_source = self._get_head_value(doc, 'Author')
                if original_source:
                    item['original_source'] = original_source

                item['headline'] = self._get_head_value(doc, 'Headline')

                code = self._get_head_value(doc, 'SubjectRefNum')
                if code and len(code) == 7:
                    code = '0' + code
                if code and code in subject_codes:
                    item['subject'] = []
                    item['subject'].append({
                        'qcode': code,
                        'name': subject_codes[code]
                    })
                    try:
                        process_iptc_codes(item, None)
                    except Exception:
                        pass

                slug = self._get_head_value(doc, 'SLUG')
                if slug:
                    item['slugline'] = slug
                else:
                    item['slugline'] = self._get_head_value(doc, 'Keyword')

                take_key = self._get_head_value(doc, 'Takekey')
                if take_key:
                    item['anpa_take_key'] = take_key

                self._addkeywords('Topic', doc, item)

                # self._addkeywords('Selectors', doc, item)

                item['pubstatus'] = 'usable'
                # this is required for the archived service additional lookup
                item['item_id'] = item['guid']
                item[config.VERSION] = 1
                item['flags'] = {'marked_archived_only': True}

                # item['_id'] = ObjectId(id.rjust(24,'0'))
                item['_id'] = ObjectId()
                items.append(item)

                if self._limit:
                    self._limit -= 1
                # print(item)
            except Exception as ex:
                print('Exception parsing DC document {}'.format(id))

        try:
            res = superdesk.get_resource_service('archived')
            s = time.time()
            res.post(items)
            print('Batch post to Superdesk took {:.2f} seconds'.format(time.time() - s))
        except Exception as ex:
            if ex.code == 409:
                print('Key clash exception detected')
                # create a list of the guids we tried to post
                guids = [g['guid'] for g in items]
                # create a query for all those id's
                query = {
                    'size': self.BATCH_SIZE,
                    'query': {
                        'filtered': {
                            'filter': {
                                "terms": {
                                    "guid": [guids]
                                }
                            }
                        }
                    }
                }

                req = ParsedRequest()
                repos = 'archived'
                req.args = {'source': json.dumps(query), 'repo': repos}

                search_res = superdesk.get_resource_service('search')
                existing = search_res.get(req=req, lookup=None)
                existing_guids = [e['guid'] for e in existing]
                not_existing = [g for g in guids if g not in existing_guids]
                for missing_guid in not_existing:
                    i = [m for m in items if m['guid'] == missing_guid]
                    original = res.find_one(req=None, guid=i[0]['guid'])
                    if not original:
                        try:
                            s = time.time()
                            res.post(i)
                            print('Post single item to Superdesk in {:.2f} seconds'.format(time.time() - s))
                        except Exception as ex:
                            print('Exception posting single item')
            else:
                print('Exception posting batch')
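The agency clean-up above reduces to a few rules: 25-character agency values map to AAP, numeric agencies are replaced by the upper-cased first token of the story's sign-off line, AAP-prefixed values collapse to AAP, and anything outside the known-agency set becomes UNKNOWN. A standalone sketch of that last step:

KNOWN_AGENCIES = {'AAP', 'AP', 'REUT', 'Asia Pulse', 'DPA', 'AFP', 'RAW',
                  'NZA', 'NZPA', 'KRT', 'PA', 'PAA', 'SNI', 'REUTERS'}

def clean_source(sign_off_line):
    """Sketch of the sign-off based source normalisation used above."""
    source = sign_off_line.split(' ')[0].upper()
    if source.startswith('AAP'):
        return 'AAP'
    return source if source in KNOWN_AGENCIES else 'UNKNOWN'

# clean_source('REUT london bureau') -> 'REUT'
# clean_source('freelance copy')     -> 'UNKNOWN'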
Example #59
0
def prepopulate_data(file_name, default_user=None, directory=None):
    if default_user is None:
        default_user = get_default_user()

    if not directory:
        directory = os.path.abspath(os.path.dirname(__file__))
    placeholders = {'NOW()': date_to_str(utcnow())}
    users = {default_user['username']: default_user['password']}
    default_username = default_user['username']
    file = os.path.join(directory, file_name)
    with open(file, 'rt', encoding='utf8') as app_prepopulation:
        json_data = json.load(app_prepopulation)
        for item in json_data:
            resource = item.get('resource', None)
            try:
                service = get_resource_service(resource)
            except KeyError:
                continue  # resource which is not configured - ignore
            username = item.get('username', None) or default_username
            set_logged_user(username, users[username])
            id_name = item.get('id_name', None)
            id_update = item.get('id_update', None)
            text = json.dumps(item.get('data', None))
            text = apply_placeholders(placeholders, text)
            data = json.loads(text)
            if resource:
                app.data.mongo._mongotize(data, resource)
            if resource == 'users':
                users.update({data['username']: data['password']})
            if id_update:
                id_update = apply_placeholders(placeholders, id_update)
                res = service.patch(ObjectId(id_update), data)
                if not res:
                    raise Exception()
            else:
                try:
                    ids = service.post([data])
                except werkzeug.exceptions.Conflict:
                    # instance was already prepopulated
                    break
                except superdesk.errors.SuperdeskApiError as e:
                    logger.exception(e)
                    continue  # an error raised by validation
                if not ids:
                    raise Exception()
                if id_name:
                    placeholders[id_name] = str(ids[0])

            if app.config['VERSION'] in data:
                number_of_versions_to_insert = data[app.config['VERSION']]
                doc_versions = []

                if data[ITEM_STATE] not in [
                        CONTENT_STATE.PUBLISHED, CONTENT_STATE.CORRECTED,
                        CONTENT_STATE.KILLED
                ]:
                    while number_of_versions_to_insert != 0:
                        doc_versions.append(data.copy())
                        number_of_versions_to_insert -= 1
                else:
                    if data[ITEM_STATE] in [
                            CONTENT_STATE.KILLED, CONTENT_STATE.RECALLED,
                            CONTENT_STATE.CORRECTED
                    ]:
                        latest_version = data.copy()
                        doc_versions.append(latest_version)

                        published_version = data.copy()
                        published_version[ITEM_STATE] = CONTENT_STATE.PUBLISHED
                        published_version[ITEM_OPERATION] = 'publish'
                        published_version[app.config['VERSION']] = number_of_versions_to_insert - 1
                        doc_versions.append(published_version)

                        number_of_versions_to_insert -= 2
                    elif data[ITEM_STATE] == CONTENT_STATE.PUBLISHED:
                        published_version = data.copy()
                        doc_versions.append(published_version)
                        number_of_versions_to_insert -= 1

                    while number_of_versions_to_insert != 0:
                        doc = data.copy()
                        doc[ITEM_STATE] = CONTENT_STATE.PROGRESS
                        doc.pop(ITEM_OPERATION, '')
                        doc[app.config['VERSION']] = number_of_versions_to_insert
                        doc_versions.append(doc)

                        number_of_versions_to_insert -= 1

                insert_versioning_documents(
                    resource, doc_versions if doc_versions else data)
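Each entry of the prepopulate file drives one service call: resource picks the service, data is posted (or patched when id_update is set), id_name stores the created _id as a placeholder for later entries, and the literal 'NOW()' is replaced with the current timestamp. A hedged sketch of one entry; field values are illustrative:

# Hypothetical prepopulate entry (shown as a Python literal; the file itself is JSON).
prepopulate_entry = {
    "resource": "desks",           # selects get_resource_service('desks')
    "id_name": "desk_sports",      # new _id stored under this placeholder key
    "data": {"name": "Sports Desk", "created": "NOW()"},  # 'NOW()' -> date_to_str(utcnow())
}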
def find_and_replace(item, **kwargs):
    """
    Find and replace words
    :param dict item:
    :param kwargs:
    :return tuple(dict, dict): tuple of modified item and diff of items modified.
    """
    diff = {}

    def repl(new, old):
        """
        Returns a version of the "new" string that matches the case of the "old" string
        :param new:
        :param old:
        :return: a string which is a version of "new" that matches the case of old.
        """
        if old.islower():
            return new.lower()
        elif old.isupper():
            return new.upper()
        else:
            # the old string starts with upper case so we use the title function
            if old[:1].isupper():
                return new.title()
            # it is more complex so try to match it
            else:
                result = ''
                all_upper = True
                for i, c in enumerate(old):
                    if i >= len(new):
                        break
                    if c.isupper():
                        result += new[i].upper()
                    else:
                        result += new[i].lower()
                        all_upper = False
                # append any remaining characters from new
                if all_upper:
                    result += new[i + 1:].upper()
                else:
                    result += new[i + 1:].lower()
                return result

    def do_find_replace(input_string, words_list):
        found_list = {}
        for word in words_list:
            pattern = r'{}'.format(re.escape(word.get('existing', '')))

            while re.search(pattern, input_string, flags=re.IGNORECASE):
                # get the original string from the input
                original = re.search(pattern, input_string, flags=re.IGNORECASE).group(0)
                replacement = repl(word.get('replacement', ''), original)
                if found_list.get(original):
                    break
                diff[original] = replacement
                found_list[original] = replacement
                input_string = input_string.replace(original, replacement)

        return input_string

    vocab = get_resource_service('vocabularies').find_one(req=None,
                                                          _id='replace_words')

    if vocab:
        replace_words_list = vocab.get('items') or []

        if not replace_words_list:
            return (item, diff)

        for field in macro_replacement_fields:
            if not item.get(field, None):
                continue

            item[field] = do_find_replace(item[field], replace_words_list)

    return (item, diff)
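A sketch of the macro in action, assuming the 'replace_words' vocabulary holds items shaped like {'existing': ..., 'replacement': ...} (the keys read above) and that body_html is among macro_replacement_fields:

item = {'body_html': 'The colour scheme. COLOUR matters.'}
item, diff = find_and_replace(item)
# With {'existing': 'colour', 'replacement': 'color'} in the vocabulary,
# repl() preserves the case of each match, so diff would be:
# {'colour': 'color', 'COLOUR': 'COLOR'}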