Beispiel #1
0
def package_show(context, data_dict):
    user = context.get('user')
    package = get_package_object(context, data_dict)
    labels = get_permission_labels()
    user_labels = labels.get_user_dataset_labels(context['auth_user_obj'])
    authorized = any(
        dl in user_labels for dl in labels.get_dataset_labels(package))

    if not authorized:
        return {
            'success': False,
            'msg': _('User %s not authorized to read package %s') % (user, package.id)}
    else:
        return {'success': True}
Beispiel #2
0
    def index_package(self, pkg_dict, defer_commit=False):
        if pkg_dict is None:
            return

        # tracking summary values will be stale, never store them
        tracking_summary = pkg_dict.pop('tracking_summary', None)
        for r in pkg_dict.get('resources', []):
            r.pop('tracking_summary', None)

        data_dict_json = json.dumps(pkg_dict)

        if config.get('ckan.cache_validated_datasets', True):
            package_plugin = lib_plugins.lookup_package_plugin(
                pkg_dict.get('type'))

            schema = package_plugin.show_package_schema()
            validated_pkg_dict, errors = lib_plugins.plugin_validate(
                package_plugin, {
                    'model': model,
                    'session': model.Session
                }, pkg_dict, schema, 'package_show')
            pkg_dict['validated_data_dict'] = json.dumps(
                validated_pkg_dict,
                cls=ckan.lib.navl.dictization_functions.MissingNullEncoder)

        pkg_dict['data_dict'] = data_dict_json

        # add to string field for sorting
        title = pkg_dict.get('title')
        if title:
            pkg_dict['title_string'] = title

        # delete the package if there is no state, or the state is `deleted`
        if (not pkg_dict.get('state') or 'deleted' in pkg_dict.get('state')):
            return self.delete_package(pkg_dict)

        index_fields = RESERVED_FIELDS + list(pkg_dict.keys())

        # include the extras in the main namespace
        extras = pkg_dict.get('extras', [])
        for extra in extras:
            key, value = extra['key'], extra['value']
            if isinstance(value, (tuple, list)):
                value = " ".join(map(text_type, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        pkg_dict.pop('extras', None)

        # add tags, removing vocab tags from 'tags' list and adding them as
        # vocab_<tag name> so that they can be used in facets
        non_vocab_tag_names = []
        tags = pkg_dict.pop('tags', [])
        context = {'model': model}

        for tag in tags:
            if tag.get('vocabulary_id'):
                data = {'id': tag['vocabulary_id']}
                vocab = logic.get_action('vocabulary_show')(context, data)
                key = u'vocab_%s' % vocab['name']
                if key in pkg_dict:
                    pkg_dict[key].append(tag['name'])
                else:
                    pkg_dict[key] = [tag['name']]
            else:
                non_vocab_tag_names.append(tag['name'])

        pkg_dict['tags'] = non_vocab_tag_names

        # add groups
        groups = pkg_dict.pop('groups', [])

        # we use the capacity to make things private in the search index
        if pkg_dict['private']:
            pkg_dict['capacity'] = 'private'
        else:
            pkg_dict['capacity'] = 'public'

        pkg_dict['groups'] = [group['name'] for group in groups]

        # if there is an owner_org we want to add this to groups for index
        # purposes
        if pkg_dict.get('organization'):
            pkg_dict['organization'] = pkg_dict['organization']['name']
        else:
            pkg_dict['organization'] = None

        # tracking
        if not tracking_summary:
            tracking_summary = model.TrackingSummary.get_for_package(
                pkg_dict['id'])
        pkg_dict['views_total'] = tracking_summary['total']
        pkg_dict['views_recent'] = tracking_summary['recent']

        resource_fields = [('name', 'res_name'),
                           ('description', 'res_description'),
                           ('format', 'res_format'), ('url', 'res_url'),
                           ('resource_type', 'res_type')]
        resource_extras = [(e, 'res_extras_' + e)
                           for e in model.Resource.get_extra_columns()]
        # flatten the structure for indexing:
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in resource_fields + resource_extras:
                pkg_dict[nkey] = pkg_dict.get(nkey,
                                              []) + [resource.get(okey, u'')]
        pkg_dict.pop('resources', None)

        rel_dict = collections.defaultdict(list)
        subjects = pkg_dict.pop("relationships_as_subject", [])
        objects = pkg_dict.pop("relationships_as_object", [])
        for rel in objects:
            type = model.PackageRelationship.forward_to_reverse_type(
                rel['type'])
            rel_dict[type].append(
                model.Package.get(rel['subject_package_id']).name)
        for rel in subjects:
            type = rel['type']
            rel_dict[type].append(
                model.Package.get(rel['object_package_id']).name)
        for key, value in six.iteritems(rel_dict):
            if key not in pkg_dict:
                pkg_dict[key] = value

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

        # Save dataset type
        pkg_dict['dataset_type'] = pkg_dict['type']

        # clean the dict fixing keys and dates
        # FIXME where are we getting these dirty keys from?  can we not just
        # fix them in the correct place or is this something that always will
        # be needed?  For my data not changing the keys seems to not cause a
        # problem.
        new_dict = {}
        bogus_date = datetime.datetime(1, 1, 1)
        for key, value in pkg_dict.items():
            key = six.ensure_str(key)
            if key.endswith('_date'):
                try:
                    date = parse(value, default=bogus_date)
                    if date != bogus_date:
                        value = date.isoformat() + 'Z'
                    else:
                        # The date field was empty, so dateutil filled it with
                        # the default bogus date
                        value = None
                except (ValueError, IndexError):
                    continue
            new_dict[key] = value
        pkg_dict = new_dict

        for k in ('title', 'notes', 'title_string'):
            if k in pkg_dict and pkg_dict[k]:
                pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
        # with UTC time (i.e trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict['metadata_created'] += 'Z'
        pkg_dict['metadata_modified'] += 'Z'

        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')

        # Strip a selection of the fields.
        # These fields are possible candidates for sorting search results on,
        # so we strip leading spaces because solr will sort " " before "a" or "A".
        for field_name in ['title']:
            try:
                value = pkg_dict.get(field_name)
                if value:
                    pkg_dict[field_name] = value.lstrip()
            except KeyError:
                pass

        # add a unique index_id to avoid conflicts
        import hashlib
        pkg_dict['index_id'] = hashlib.md5(
            six.b('%s%s' %
                  (pkg_dict['id'], config.get('ckan.site_id')))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, 'Plugin must return non empty package dict on index'

        # permission labels determine visibility in search, can't be set
        # in original dataset or before_index plugins
        labels = lib_plugins.get_permission_labels()
        dataset = model.Package.get(pkg_dict['id'])
        pkg_dict['permission_labels'] = labels.get_dataset_labels(
            dataset) if dataset else []  # TestPackageSearchIndex-workaround

        # send to solr:
        try:
            conn = make_connection()
            commit = not defer_commit
            if not asbool(config.get('ckan.search.solr_commit', 'true')):
                commit = False
            conn.add(docs=[pkg_dict], commit=commit)
        except pysolr.SolrError as e:
            msg = 'Solr returned an error: {0}'.format(
                e.args[0][:1000]  # limit huge responses
            )
            raise SearchIndexError(msg)
        except socket.error as e:
            err = 'Could not connect to Solr using {0}: {1}'.format(
                conn.url, str(e))
            log.error(err)
            raise SearchIndexError(err)

        commit_debug_msg = 'Not committed yet' if defer_commit else 'Committed'
        log.debug('Updated index for %s [%s]' %
                  (pkg_dict.get('name'), commit_debug_msg))
Beispiel #3
0
        include_private = asbool(data_dict.pop('include_private', False))
        include_drafts = asbool(data_dict.pop('include_drafts', False))
        data_dict.setdefault('fq', '')
        if not include_private:
            data_dict['fq'] = '+capacity:public ' + data_dict['fq']
        if include_drafts:
            data_dict['fq'] += ' +state:(active OR draft)'

        # Pop these ones as Solr does not need them
        extras = data_dict.pop('extras', None)

        # enforce permission filter based on user
        if context.get('ignore_auth') or (user and authz.is_sysadmin(user)):
            labels = None
        else:
            labels = lib_plugins.get_permission_labels(
                ).get_user_dataset_labels(context['auth_user_obj'])

        query = search.query_for(model.Package)
        query.run(data_dict, permission_labels=labels)

        # Add them back so extensions can use them on after_search
        data_dict['extras'] = extras

        if result_fl:
            for package in query.results:
                if package.get('extras'):
                    package.update(package['extras'] )
                    package.pop('extras')
                results.append(package)
        else:
            for package in query.results:
def package_search(context, data_dict):
    # sometimes context['schema'] is None
    schema = (context.get('schema')
              or logic.schema.default_package_search_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    # put the extras back into the data_dict so that the search can
    # report needless parameters
    data_dict.update(data_dict.get('__extras', {}))
    data_dict.pop('__extras', None)
    if errors:
        raise ValidationError(errors)

    model = context['model']
    session = context['session']
    user = context.get('user')

    _check_access('package_search', context, data_dict)

    # Move ext_ params to extras and remove them from the root of the search
    # params, so they don't cause and error
    data_dict['extras'] = data_dict.get('extras', {})
    for key in [key for key in data_dict.keys() if key.startswith('ext_')]:
        data_dict['extras'][key] = data_dict.pop(key)

    # check if some extension needs to modify the search params
    for item in plugins.PluginImplementations(plugins.IPackageController):
        data_dict = item.before_search(data_dict)

    # the extension may have decided that it is not necessary to perform
    # the query
    abort = data_dict.get('abort_search', False)

    if data_dict.get('sort') in (None, 'rank'):
        data_dict['sort'] = 'score desc, metadata_modified desc'

    results = []
    if not abort:
        if asbool(data_dict.get('use_default_schema')):
            data_source = 'data_dict'
        else:
            data_source = 'validated_data_dict'
        data_dict.pop('use_default_schema', None)

        result_fl = data_dict.get('fl')
        if not result_fl:
            data_dict['fl'] = 'id {0}'.format(data_source)
        else:
            data_dict['fl'] = ' '.join(result_fl)

        # Remove before these hit solr FIXME: whitelist instead
        include_private = asbool(data_dict.pop('include_private', False))
        include_drafts = asbool(data_dict.pop('include_drafts', False))
        data_dict.setdefault('fq', '')
        if not include_private:
            data_dict['fq'] = '+capacity:public ' + data_dict['fq']
        if include_drafts:
            data_dict['fq'] += ' +state:(active OR draft)'

        # Pop these ones as Solr does not need them
        extras = data_dict.pop('extras', None)

        # enforce permission filter based on user
        if context.get('ignore_auth') or (user and authz.is_sysadmin(user)):
            labels = None
        else:
            labels = lib_plugins.get_permission_labels(
            ).get_user_dataset_labels(context['auth_user_obj'])

        query = PackageSearchQuery()
        query.run(data_dict, permission_labels=labels)

        # Add them back so extensions can use them on after_search
        data_dict['extras'] = extras

        if result_fl and not extras.get('fl_compatible', False):
            for package in query.results:
                if package.get('extras'):
                    package.update(package['extras'])
                    package.pop('extras')
                results.append(package)
        else:
            for package in query.results:
                # get the package object
                package_dict = package.get(data_source)
                ## use data in search index if there
                if package_dict:
                    # the package_dict still needs translating when being viewed
                    package_dict = json.loads(package_dict)
                    if context.get('for_view'):
                        for item in plugins.PluginImplementations(
                                plugins.IPackageController):
                            package_dict = item.before_view(package_dict)
                    results.append(package_dict)
                else:
                    log.error(
                        'No package_dict is coming from solr for package '
                        'id %s', package['id'])

        count = query.count
        facets = query.facets
        raw_solr_results = {
            'results': query.results,
            'highlighting': query.highlighting,
            'count': query.count,
            'facets': query.facets,
        }
    else:
        count = 0
        facets = {}
        results = []
        raw_solr_results = {}

    search_results = {
        'count': count,
        'facets': facets,
        'results': results,
        'sort': data_dict['sort'],
    }

    include_raw_solr_results = False
    for item in plugins.PluginImplementations(plugins.IPackageController):
        if 'include_raw_solr_results' in dir(item):
            include_raw_solr_results = include_raw_solr_results \
                                       or item.include_raw_solr_results(data_dict)

    if include_raw_solr_results:
        search_results['raw_solr_results'] = raw_solr_results

    # create a lookup table of group name to title for all the groups and
    # organizations in the current search's facets.
    group_names = []
    for field_name in ('groups', 'organization'):
        group_names.extend(facets.get(field_name, {}).keys())

    groups = (session.query(model.Group.name, model.Group.title).filter(
        model.Group.name.in_(group_names)).all() if group_names else [])
    group_titles_by_name = dict(groups)

    # Transform facets into a more useful data structure.
    restructured_facets = {}
    for key, value in facets.items():
        restructured_facets[key] = {'title': key, 'items': []}
        for key_, value_ in value.items():
            new_facet_dict = {}
            new_facet_dict['name'] = key_
            if key in ('groups', 'organization'):
                display_name = group_titles_by_name.get(key_, key_)
                display_name = display_name if display_name and display_name.strip(
                ) else key_
                new_facet_dict['display_name'] = display_name
            elif key == 'license_id':
                license = model.Package.get_license_register().get(key_)
                if license:
                    new_facet_dict['display_name'] = license.title
                else:
                    new_facet_dict['display_name'] = key_
            else:
                new_facet_dict['display_name'] = key_
            new_facet_dict['count'] = value_
            restructured_facets[key]['items'].append(new_facet_dict)
    search_results['search_facets'] = restructured_facets

    # check if some extension needs to modify the search results
    for item in plugins.PluginImplementations(plugins.IPackageController):
        search_results = item.after_search(search_results, data_dict)

    # After extensions have had a chance to modify the facets, sort them by
    # display name.
    for facet in search_results['search_facets']:
        search_results['search_facets'][facet]['items'] = sorted(
            search_results['search_facets'][facet]['items'],
            key=lambda facet: facet['display_name'],
            reverse=True)

    return search_results
Beispiel #5
0
    def index_package(self, pkg_dict, defer_commit=False):
        if pkg_dict is None:
            return

        # tracking summary values will be stale, never store them
        tracking_summary = pkg_dict.pop('tracking_summary', None)
        for r in pkg_dict.get('resources', []):
            r.pop('tracking_summary', None)

        data_dict_json = json.dumps(pkg_dict)

        if config.get('ckan.cache_validated_datasets', True):
            package_plugin = lib_plugins.lookup_package_plugin(
                pkg_dict.get('type'))

            schema = package_plugin.show_package_schema()
            validated_pkg_dict, errors = lib_plugins.plugin_validate(
                package_plugin, {'model': model, 'session': model.Session},
                pkg_dict, schema, 'package_show')
            pkg_dict['validated_data_dict'] = json.dumps(validated_pkg_dict,
                cls=ckan.lib.navl.dictization_functions.MissingNullEncoder)

        pkg_dict['data_dict'] = data_dict_json

        # add to string field for sorting
        title = pkg_dict.get('title')
        if title:
            pkg_dict['title_string'] = title

        # delete the package if there is no state, or the state is `deleted`
        if (not pkg_dict.get('state') or 'deleted' in pkg_dict.get('state')):
            # mark as deleted instead of deleting it
            pkg_dict['state'] = 'deleted'
            #return self.delete_package(pkg_dict)

        index_fields = RESERVED_FIELDS + pkg_dict.keys()

        # include the extras in the main namespace
        extras = pkg_dict.get('extras', [])
        for extra in extras:
            key, value = extra['key'], extra['value']
            if isinstance(value, (tuple, list)):
                value = " ".join(map(text_type, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        pkg_dict.pop('extras', None)

        # add tags, removing vocab tags from 'tags' list and adding them as
        # vocab_<tag name> so that they can be used in facets
        non_vocab_tag_names = []
        tags = pkg_dict.pop('tags', [])
        context = {'model': model}

        for tag in tags:
            if tag.get('vocabulary_id'):
                data = {'id': tag['vocabulary_id']}
                vocab = logic.get_action('vocabulary_show')(context, data)
                key = u'vocab_%s' % vocab['name']
                if key in pkg_dict:
                    pkg_dict[key].append(tag['name'])
                else:
                    pkg_dict[key] = [tag['name']]
            else:
                non_vocab_tag_names.append(tag['name'])

        pkg_dict['tags'] = non_vocab_tag_names

        # add groups
        groups = pkg_dict.pop('groups', [])

        # we use the capacity to make things private in the search index
        if pkg_dict['private']:
            pkg_dict['capacity'] = 'private'
        else:
            pkg_dict['capacity'] = 'public'

        pkg_dict['groups'] = [group['name'] for group in groups]

        # if there is an owner_org we want to add this to groups for index
        # purposes
        if pkg_dict.get('organization'):
           pkg_dict['organization'] = pkg_dict['organization']['name']
        else:
           pkg_dict['organization'] = None

        # tracking
        if not tracking_summary:
            tracking_summary = model.TrackingSummary.get_for_package(
                pkg_dict['id'])
        pkg_dict['views_total'] = tracking_summary['total']
        pkg_dict['views_recent'] = tracking_summary['recent']

        resource_fields = [('name', 'res_name'),
                           ('description', 'res_description'),
                           ('format', 'res_format'),
                           ('url', 'res_url'),
                           ('resource_type', 'res_type')]
        resource_extras = [(e, 'res_extras_' + e) for e
                            in model.Resource.get_extra_columns()]
        # flatten the structure for indexing:
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in resource_fields + resource_extras:
                pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
        pkg_dict.pop('resources', None)

        rel_dict = collections.defaultdict(list)
        subjects = pkg_dict.pop("relationships_as_subject", [])
        objects = pkg_dict.pop("relationships_as_object", [])
        for rel in objects:
            type = model.PackageRelationship.forward_to_reverse_type(rel['type'])
            rel_dict[type].append(model.Package.get(rel['subject_package_id']).name)
        for rel in subjects:
            type = rel['type']
            rel_dict[type].append(model.Package.get(rel['object_package_id']).name)
        for key, value in rel_dict.iteritems():
            if key not in pkg_dict:
                pkg_dict[key] = value

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

        # Save dataset type
        pkg_dict['dataset_type'] = pkg_dict['type']

        # clean the dict fixing keys and dates
        # FIXME where are we getting these dirty keys from?  can we not just
        # fix them in the correct place or is this something that always will
        # be needed?  For my data not changing the keys seems to not cause a
        # problem.
        new_dict = {}
        bogus_date = datetime.datetime(1, 1, 1)
        for key, value in pkg_dict.items():
            key = key.encode('ascii', 'ignore')
            if key.endswith('_date'):
                try:
                    date = parse(value, default=bogus_date)
                    if date != bogus_date:
                        value = date.isoformat() + 'Z'
                    else:
                        # The date field was empty, so dateutil filled it with
                        # the default bogus date
                        value = None
                except ValueError:
                    continue
            new_dict[key] = value
        pkg_dict = new_dict

        for k in ('title', 'notes', 'title_string'):
            if k in pkg_dict and pkg_dict[k]:
                pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
        # with UTC time (i.e trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict['metadata_created'] += 'Z'
        pkg_dict['metadata_modified'] += 'Z'

        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')

        # Strip a selection of the fields.
        # These fields are possible candidates for sorting search results on,
        # so we strip leading spaces because solr will sort " " before "a" or "A".
        for field_name in ['title']:
            try:
                value = pkg_dict.get(field_name)
                if value:
                    pkg_dict[field_name] = value.lstrip()
            except KeyError:
                pass

        # add a unique index_id to avoid conflicts
        import hashlib
        pkg_dict['index_id'] = hashlib.md5('%s%s' % (pkg_dict['id'],config.get('ckan.site_id'))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, 'Plugin must return non empty package dict on index'

        # permission labels determine visibility in search, can't be set
        # in original dataset or before_index plugins
        labels = lib_plugins.get_permission_labels()
        dataset = model.Package.get(pkg_dict['id'])
        pkg_dict['permission_labels'] = labels.get_dataset_labels(
            dataset) if dataset else [] # TestPackageSearchIndex-workaround

        # send to solr:
        try:
            conn = make_connection()
            commit = not defer_commit
            if not asbool(config.get('ckan.search.solr_commit', 'true')):
                commit = False
            conn.add(docs=[pkg_dict], commit=commit)
        except pysolr.SolrError as e:
            msg = 'Solr returned an error: {0}'.format(
                e[:1000] # limit huge responses
            )
            raise SearchIndexError(msg)
        except socket.error as e:
            err = 'Could not connect to Solr using {0}: {1}'.format(conn.url, str(e))
            log.error(err)
            raise SearchIndexError(err)

        commit_debug_msg = 'Not committed yet' if defer_commit else 'Committed'
        log.debug('Updated index for %s [%s]' % (pkg_dict.get('name'), commit_debug_msg))