def package_show(context, data_dict):
    """Authorize reading a dataset by comparing permission labels.

    The dataset is readable when at least one of its permission labels is
    also among the labels granted to ``context['auth_user_obj']``.

    :param context: auth context; ``'auth_user_obj'`` is required and
        ``'user'`` is only used for the error message
    :param data_dict: passed through to ``get_package_object`` to resolve
        the dataset
    :returns: ``{'success': True}`` when authorized, otherwise a dict with
        ``'success': False`` and a translated ``'msg'``
    """
    dataset = get_package_object(context, data_dict)
    permission_labels = get_permission_labels()
    granted = permission_labels.get_user_dataset_labels(
        context['auth_user_obj'])

    # The first shared label is enough to grant access.
    for dataset_label in permission_labels.get_dataset_labels(dataset):
        if dataset_label in granted:
            return {'success': True}

    username = context.get('user')
    denial = _('User %s not authorized to read package %s') % (
        username, dataset.id)
    return {'success': False, 'msg': denial}
def index_package(self, pkg_dict, defer_commit=False):
    """Flatten a dataset dict into a Solr document and add it to the index.

    ``pkg_dict`` is mutated in place while it is being flattened (extras,
    tags, groups, resources and relationships are popped and re-added as
    flat fields), so callers should not rely on its contents afterwards.

    :param pkg_dict: dataset dict to index; ``None`` is a no-op
    :param defer_commit: when true, the document is added without asking
        Solr to commit
    :returns: ``None``, or the result of ``self.delete_package`` when the
        dataset has no state or a state containing ``'deleted'``
    :raises SearchIndexError: when Solr reports an error or the connection
        fails
    """
    if pkg_dict is None:
        return

    # tracking summary values will be stale, never store them
    tracking_summary = pkg_dict.pop('tracking_summary', None)
    for r in pkg_dict.get('resources', []):
        r.pop('tracking_summary', None)

    # Serialise the dataset before any of the flattening below touches it.
    data_dict_json = json.dumps(pkg_dict)

    if config.get('ckan.cache_validated_datasets', True):
        package_plugin = lib_plugins.lookup_package_plugin(
            pkg_dict.get('type'))

        schema = package_plugin.show_package_schema()
        # NOTE(review): `errors` is never inspected - presumably validation
        # failures are tolerated here on purpose; confirm.
        validated_pkg_dict, errors = lib_plugins.plugin_validate(
            package_plugin, {
                'model': model,
                'session': model.Session
            }, pkg_dict, schema, 'package_show')
        pkg_dict['validated_data_dict'] = json.dumps(
            validated_pkg_dict,
            cls=ckan.lib.navl.dictization_functions.MissingNullEncoder)

    pkg_dict['data_dict'] = data_dict_json

    # add to string field for sorting
    title = pkg_dict.get('title')
    if title:
        pkg_dict['title_string'] = title

    # delete the package if there is no state, or the state is `deleted`
    # (this is a substring test, so any state containing 'deleted' matches)
    if (not pkg_dict.get('state') or 'deleted' in pkg_dict.get('state')):
        return self.delete_package(pkg_dict)

    # Snapshot of the field names present before extras are merged in; an
    # extra may only claim a top-level key that is not in this snapshot.
    index_fields = RESERVED_FIELDS + list(pkg_dict.keys())

    # include the extras in the main namespace
    extras = pkg_dict.get('extras', [])
    for extra in extras:
        key, value = extra['key'], extra['value']
        if isinstance(value, (tuple, list)):
            value = " ".join(map(text_type, value))
        # keep only the characters allowed by KEY_CHARS
        key = ''.join([c for c in key if c in KEY_CHARS])
        pkg_dict['extras_' + key] = value
        if key not in index_fields:
            pkg_dict[key] = value
    pkg_dict.pop('extras', None)

    # add tags, removing vocab tags from 'tags' list and adding them as
    # vocab_<tag name> so that they can be used in facets
    non_vocab_tag_names = []
    tags = pkg_dict.pop('tags', [])
    context = {'model': model}

    for tag in tags:
        if tag.get('vocabulary_id'):
            data = {'id': tag['vocabulary_id']}
            # one vocabulary_show action call per vocabulary tag
            vocab = logic.get_action('vocabulary_show')(context, data)
            key = u'vocab_%s' % vocab['name']
            if key in pkg_dict:
                pkg_dict[key].append(tag['name'])
            else:
                pkg_dict[key] = [tag['name']]
        else:
            non_vocab_tag_names.append(tag['name'])

    pkg_dict['tags'] = non_vocab_tag_names

    # add groups
    groups = pkg_dict.pop('groups', [])

    # we use the capacity to make things private in the search index
    if pkg_dict['private']:
        pkg_dict['capacity'] = 'private'
    else:
        pkg_dict['capacity'] = 'public'

    pkg_dict['groups'] = [group['name'] for group in groups]

    # if there is an owner_org we want to add this to groups for index
    # purposes
    if pkg_dict.get('organization'):
        pkg_dict['organization'] = pkg_dict['organization']['name']
    else:
        pkg_dict['organization'] = None

    # tracking: fall back to a fresh lookup when the caller supplied none
    if not tracking_summary:
        tracking_summary = model.TrackingSummary.get_for_package(
            pkg_dict['id'])
    pkg_dict['views_total'] = tracking_summary['total']
    pkg_dict['views_recent'] = tracking_summary['recent']

    # (source key on the resource, flattened key on the document)
    resource_fields = [('name', 'res_name'),
                       ('description', 'res_description'),
                       ('format', 'res_format'),
                       ('url', 'res_url'),
                       ('resource_type', 'res_type')]
    resource_extras = [(e, 'res_extras_' + e)
                       for e in model.Resource.get_extra_columns()]
    # flatten the structure for indexing:
    # each res_* field becomes a list with one entry per resource, using
    # u'' where a resource lacks the key
    for resource in pkg_dict.get('resources', []):
        for (okey, nkey) in resource_fields + resource_extras:
            pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
    pkg_dict.pop('resources', None)

    # Group related dataset names by relationship type; relationships where
    # this dataset is the object are stored under the reversed type.
    rel_dict = collections.defaultdict(list)
    subjects = pkg_dict.pop("relationships_as_subject", [])
    objects = pkg_dict.pop("relationships_as_object", [])
    # NOTE(review): `type` shadows the builtin, and `model.Package.get(...)`
    # is dereferenced without a None check - confirm it cannot return None
    # for a dangling relationship.
    for rel in objects:
        type = model.PackageRelationship.forward_to_reverse_type(
            rel['type'])
        rel_dict[type].append(
            model.Package.get(rel['subject_package_id']).name)
    for rel in subjects:
        type = rel['type']
        rel_dict[type].append(
            model.Package.get(rel['object_package_id']).name)
    # existing document fields win over relationship types of the same name
    for key, value in six.iteritems(rel_dict):
        if key not in pkg_dict:
            pkg_dict[key] = value

    pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

    # Save dataset type
    pkg_dict['dataset_type'] = pkg_dict['type']

    # clean the dict fixing keys and dates
    # FIXME where are we getting these dirty keys from? can we not just
    # fix them in the correct place or is this something that always will
    # be needed? For my data not changing the keys seems to not cause a
    # problem.
    new_dict = {}
    # sentinel default: if parse() returns it unchanged, the value was empty
    bogus_date = datetime.datetime(1, 1, 1)
    for key, value in pkg_dict.items():
        key = six.ensure_str(key)
        if key.endswith('_date'):
            try:
                date = parse(value, default=bogus_date)
                if date != bogus_date:
                    value = date.isoformat() + 'Z'
                else:
                    # The date field was empty, so dateutil filled it with
                    # the default bogus date
                    value = None
            except (ValueError, IndexError):
                # unparseable *_date fields are dropped from the document
                continue
        new_dict[key] = value
    pkg_dict = new_dict

    for k in ('title', 'notes', 'title_string'):
        if k in pkg_dict and pkg_dict[k]:
            pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

    # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
    # with UTC time (i.e trailing Z)
    # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
    pkg_dict['metadata_created'] += 'Z'
    pkg_dict['metadata_modified'] += 'Z'

    # mark this CKAN instance as data source:
    pkg_dict['site_id'] = config.get('ckan.site_id')

    # Strip a selection of the fields.
    # These fields are possible candidates for sorting search results on,
    # so we strip leading spaces because solr will sort " " before "a" or "A".
    for field_name in ['title']:
        # NOTE(review): dict.get() does not raise KeyError, so this handler
        # looks unreachable.
        try:
            value = pkg_dict.get(field_name)
            if value:
                pkg_dict[field_name] = value.lstrip()
        except KeyError:
            pass

    # add a unique index_id to avoid conflicts
    # (md5 of the dataset id concatenated with the site id)
    import hashlib
    pkg_dict['index_id'] = hashlib.md5(
        six.b('%s%s' % (pkg_dict['id'],
                        config.get('ckan.site_id')))).hexdigest()

    # every IPackageController plugin may replace the document
    for item in PluginImplementations(IPackageController):
        pkg_dict = item.before_index(pkg_dict)

    # NOTE(review): assert statements are stripped under `python -O`.
    assert pkg_dict, 'Plugin must return non empty package dict on index'

    # permission labels determine visibility in search, can't be set
    # in original dataset or before_index plugins
    labels = lib_plugins.get_permission_labels()
    dataset = model.Package.get(pkg_dict['id'])
    pkg_dict['permission_labels'] = labels.get_dataset_labels(
        dataset) if dataset else []  # TestPackageSearchIndex-workaround

    # send to solr:
    try:
        conn = make_connection()
        commit = not defer_commit
        # the config switch can only turn committing off, never force it on
        if not asbool(config.get('ckan.search.solr_commit', 'true')):
            commit = False
        conn.add(docs=[pkg_dict], commit=commit)
    except pysolr.SolrError as e:
        msg = 'Solr returned an error: {0}'.format(
            e.args[0][:1000]  # limit huge responses
        )
        raise SearchIndexError(msg)
    except socket.error as e:
        # NOTE(review): `conn` would be unbound here if make_connection()
        # itself raised socket.error.
        err = 'Could not connect to Solr using {0}: {1}'.format(
            conn.url, str(e))
        log.error(err)
        raise SearchIndexError(err)

    commit_debug_msg = 'Not committed yet' if defer_commit else 'Committed'
    log.debug('Updated index for %s [%s]' %
              (pkg_dict.get('name'), commit_debug_msg))
include_private = asbool(data_dict.pop('include_private', False)) include_drafts = asbool(data_dict.pop('include_drafts', False)) data_dict.setdefault('fq', '') if not include_private: data_dict['fq'] = '+capacity:public ' + data_dict['fq'] if include_drafts: data_dict['fq'] += ' +state:(active OR draft)' # Pop these ones as Solr does not need them extras = data_dict.pop('extras', None) # enforce permission filter based on user if context.get('ignore_auth') or (user and authz.is_sysadmin(user)): labels = None else: labels = lib_plugins.get_permission_labels( ).get_user_dataset_labels(context['auth_user_obj']) query = search.query_for(model.Package) query.run(data_dict, permission_labels=labels) # Add them back so extensions can use them on after_search data_dict['extras'] = extras if result_fl: for package in query.results: if package.get('extras'): package.update(package['extras'] ) package.pop('extras') results.append(package) else: for package in query.results:
def package_search(context, data_dict):
    """Search datasets through the Solr index.

    The search parameters are validated, passed through every
    ``IPackageController.before_search`` plugin, restricted by visibility
    (``capacity``/``state`` filters and the user's permission labels) and
    run against Solr.  The results are passed through every
    ``IPackageController.after_search`` plugin before being returned.

    :param context: action context; ``'model'`` and ``'session'`` are
        required, ``'auth_user_obj'`` is required unless ``'ignore_auth'``
        is set or the user is a sysadmin
    :param data_dict: search parameters (``q``, ``fq``, ``fl``, ``sort``,
        ``include_private``, ``include_drafts``, ``use_default_schema``,
        ``extras``, ``ext_*`` ...)
    :returns: dict with ``count``, ``facets``, ``results``, ``sort`` and
        ``search_facets``, plus ``raw_solr_results`` when a plugin asks for
        it through ``include_raw_solr_results``
    :raises ValidationError: when ``data_dict`` does not validate
    """
    # sometimes context['schema'] is None
    schema = (context.get('schema')
              or logic.schema.default_package_search_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    # put the extras back into the data_dict so that the search can
    # report needless parameters
    data_dict.update(data_dict.get('__extras', {}))
    data_dict.pop('__extras', None)
    if errors:
        raise ValidationError(errors)

    model = context['model']
    session = context['session']
    user = context.get('user')

    _check_access('package_search', context, data_dict)

    # Move ext_ params to extras and remove them from the root of the search
    # params, so they don't cause an error
    data_dict['extras'] = data_dict.get('extras', {})
    for key in [key for key in data_dict.keys() if key.startswith('ext_')]:
        data_dict['extras'][key] = data_dict.pop(key)

    # check if some extension needs to modify the search params
    for item in plugins.PluginImplementations(plugins.IPackageController):
        data_dict = item.before_search(data_dict)

    # the extension may have decided that it is not necessary to perform
    # the query
    abort = data_dict.get('abort_search', False)

    if data_dict.get('sort') in (None, 'rank'):
        data_dict['sort'] = 'score desc, metadata_modified desc'

    results = []
    if not abort:
        # which cached JSON blob of the dataset to read back from the index
        if asbool(data_dict.get('use_default_schema')):
            data_source = 'data_dict'
        else:
            data_source = 'validated_data_dict'
        data_dict.pop('use_default_schema', None)

        result_fl = data_dict.get('fl')
        if not result_fl:
            data_dict['fl'] = 'id {0}'.format(data_source)
        else:
            data_dict['fl'] = ' '.join(result_fl)

        # Remove before these hit solr FIXME: whitelist instead
        include_private = asbool(data_dict.pop('include_private', False))
        include_drafts = asbool(data_dict.pop('include_drafts', False))
        data_dict.setdefault('fq', '')
        if not include_private:
            data_dict['fq'] = '+capacity:public ' + data_dict['fq']
        if include_drafts:
            data_dict['fq'] += ' +state:(active OR draft)'

        # Pop these ones as Solr does not need them
        extras = data_dict.pop('extras', None)

        # enforce permission filter based on user
        if context.get('ignore_auth') or (user and authz.is_sysadmin(user)):
            labels = None
        else:
            labels = lib_plugins.get_permission_labels(
            ).get_user_dataset_labels(context['auth_user_obj'])

        query = PackageSearchQuery()
        query.run(data_dict, permission_labels=labels)

        # Add them back so extensions can use them on after_search
        data_dict['extras'] = extras

        # `extras` is None when a before_search plugin removed it; treat
        # that as "no flags set" instead of failing on None.get().
        fl_compatible = (extras or {}).get('fl_compatible', False)

        if result_fl and not fl_compatible:
            # caller asked for specific fields: return the raw Solr
            # documents with their extras merged into the top level
            for package in query.results:
                if package.get('extras'):
                    package.update(package['extras'])
                    package.pop('extras')
                results.append(package)
        else:
            for package in query.results:
                # get the package object
                package_dict = package.get(data_source)
                # use data in search index if there
                if package_dict:
                    # the package_dict still needs translating when being
                    # viewed
                    package_dict = json.loads(package_dict)
                    if context.get('for_view'):
                        for item in plugins.PluginImplementations(
                                plugins.IPackageController):
                            package_dict = item.before_view(package_dict)
                    results.append(package_dict)
                else:
                    log.error(
                        'No package_dict is coming from solr for package '
                        'id %s', package['id'])

        count = query.count
        facets = query.facets
        raw_solr_results = {
            'results': query.results,
            'highlighting': query.highlighting,
            'count': query.count,
            'facets': query.facets,
        }
    else:
        count = 0
        facets = {}
        raw_solr_results = {}

    search_results = {
        'count': count,
        'facets': facets,
        'results': results,
        'sort': data_dict['sort'],
    }

    # Raw Solr output is only attached when at least one plugin that
    # implements the optional hook asks for it.
    include_raw_solr_results = False
    for item in plugins.PluginImplementations(plugins.IPackageController):
        if hasattr(item, 'include_raw_solr_results'):
            include_raw_solr_results = include_raw_solr_results \
                or item.include_raw_solr_results(data_dict)

    if include_raw_solr_results:
        search_results['raw_solr_results'] = raw_solr_results

    # create a lookup table of group name to title for all the groups and
    # organizations in the current search's facets.
    group_names = []
    for field_name in ('groups', 'organization'):
        group_names.extend(facets.get(field_name, {}).keys())

    groups = (session.query(model.Group.name, model.Group.title).filter(
        model.Group.name.in_(group_names)).all() if group_names else [])
    group_titles_by_name = dict(groups)

    # Transform facets into a more useful data structure.
    restructured_facets = {}
    for key, value in facets.items():
        restructured_facets[key] = {'title': key, 'items': []}
        for key_, value_ in value.items():
            new_facet_dict = {}
            new_facet_dict['name'] = key_
            if key in ('groups', 'organization'):
                display_name = group_titles_by_name.get(key_, key_)
                # fall back to the name when the title is empty or blank
                display_name = (display_name
                                if display_name and display_name.strip()
                                else key_)
                new_facet_dict['display_name'] = display_name
            elif key == 'license_id':
                license_obj = model.Package.get_license_register().get(key_)
                if license_obj:
                    new_facet_dict['display_name'] = license_obj.title
                else:
                    new_facet_dict['display_name'] = key_
            else:
                new_facet_dict['display_name'] = key_
            new_facet_dict['count'] = value_
            restructured_facets[key]['items'].append(new_facet_dict)
    search_results['search_facets'] = restructured_facets

    # check if some extension needs to modify the search results
    for item in plugins.PluginImplementations(plugins.IPackageController):
        search_results = item.after_search(search_results, data_dict)

    # After extensions have had a chance to modify the facets, sort them by
    # display name (descending).
    for facet in search_results['search_facets']:
        search_results['search_facets'][facet]['items'] = sorted(
            search_results['search_facets'][facet]['items'],
            key=lambda facet_item: facet_item['display_name'],
            reverse=True)

    return search_results
def index_package(self, pkg_dict, defer_commit=False):
    """Flatten a dataset dict into a Solr document and add it to the index.

    Unlike the stock behaviour, a dataset without a state (or whose state
    contains ``'deleted'``) is not removed from the index: it is indexed
    with ``state`` set to ``'deleted'``.

    ``pkg_dict`` is mutated in place while it is being flattened, so callers
    should not rely on its contents afterwards.  The code runs on both
    Python 2 and Python 3.

    :param pkg_dict: dataset dict to index; ``None`` is a no-op
    :param defer_commit: when true, the document is added without asking
        Solr to commit
    :returns: ``None``
    :raises SearchIndexError: when Solr reports an error or the connection
        fails
    """
    if pkg_dict is None:
        return

    # tracking summary values will be stale, never store them
    tracking_summary = pkg_dict.pop('tracking_summary', None)
    for r in pkg_dict.get('resources', []):
        r.pop('tracking_summary', None)

    # Serialise the dataset before any of the flattening below touches it.
    data_dict_json = json.dumps(pkg_dict)

    if config.get('ckan.cache_validated_datasets', True):
        package_plugin = lib_plugins.lookup_package_plugin(
            pkg_dict.get('type'))

        schema = package_plugin.show_package_schema()
        validated_pkg_dict, errors = lib_plugins.plugin_validate(
            package_plugin, {'model': model, 'session': model.Session},
            pkg_dict, schema, 'package_show')
        pkg_dict['validated_data_dict'] = json.dumps(
            validated_pkg_dict,
            cls=ckan.lib.navl.dictization_functions.MissingNullEncoder)

    pkg_dict['data_dict'] = data_dict_json

    # add to string field for sorting
    title = pkg_dict.get('title')
    if title:
        pkg_dict['title_string'] = title

    # if there is no state, or the state is `deleted`, mark the document as
    # deleted instead of removing it from the index
    if (not pkg_dict.get('state') or 'deleted' in pkg_dict.get('state')):
        pkg_dict['state'] = 'deleted'

    # list() is required on Python 3, where keys() is a view that cannot be
    # concatenated to a list.
    index_fields = RESERVED_FIELDS + list(pkg_dict.keys())

    # include the extras in the main namespace
    extras = pkg_dict.get('extras', [])
    for extra in extras:
        key, value = extra['key'], extra['value']
        if isinstance(value, (tuple, list)):
            value = " ".join(map(text_type, value))
        key = ''.join([c for c in key if c in KEY_CHARS])
        pkg_dict['extras_' + key] = value
        if key not in index_fields:
            pkg_dict[key] = value
    pkg_dict.pop('extras', None)

    # add tags, removing vocab tags from 'tags' list and adding them as
    # vocab_<tag name> so that they can be used in facets
    non_vocab_tag_names = []
    tags = pkg_dict.pop('tags', [])
    context = {'model': model}

    for tag in tags:
        if tag.get('vocabulary_id'):
            data = {'id': tag['vocabulary_id']}
            vocab = logic.get_action('vocabulary_show')(context, data)
            key = u'vocab_%s' % vocab['name']
            if key in pkg_dict:
                pkg_dict[key].append(tag['name'])
            else:
                pkg_dict[key] = [tag['name']]
        else:
            non_vocab_tag_names.append(tag['name'])

    pkg_dict['tags'] = non_vocab_tag_names

    # add groups
    groups = pkg_dict.pop('groups', [])

    # we use the capacity to make things private in the search index
    if pkg_dict['private']:
        pkg_dict['capacity'] = 'private'
    else:
        pkg_dict['capacity'] = 'public'

    pkg_dict['groups'] = [group['name'] for group in groups]

    # if there is an owner_org we want to add this to groups for index
    # purposes
    if pkg_dict.get('organization'):
        pkg_dict['organization'] = pkg_dict['organization']['name']
    else:
        pkg_dict['organization'] = None

    # tracking: fall back to a fresh lookup when the caller supplied none
    if not tracking_summary:
        tracking_summary = model.TrackingSummary.get_for_package(
            pkg_dict['id'])
    pkg_dict['views_total'] = tracking_summary['total']
    pkg_dict['views_recent'] = tracking_summary['recent']

    # (source key on the resource, flattened key on the document)
    resource_fields = [('name', 'res_name'),
                       ('description', 'res_description'),
                       ('format', 'res_format'),
                       ('url', 'res_url'),
                       ('resource_type', 'res_type')]
    resource_extras = [(e, 'res_extras_' + e)
                       for e in model.Resource.get_extra_columns()]
    # flatten the structure for indexing:
    for resource in pkg_dict.get('resources', []):
        for (okey, nkey) in resource_fields + resource_extras:
            pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
    pkg_dict.pop('resources', None)

    # Group related dataset names by relationship type; relationships where
    # this dataset is the object are stored under the reversed type.
    rel_dict = collections.defaultdict(list)
    subjects = pkg_dict.pop("relationships_as_subject", [])
    objects = pkg_dict.pop("relationships_as_object", [])
    for rel in objects:
        rel_type = model.PackageRelationship.forward_to_reverse_type(
            rel['type'])
        rel_dict[rel_type].append(
            model.Package.get(rel['subject_package_id']).name)
    for rel in subjects:
        rel_type = rel['type']
        rel_dict[rel_type].append(
            model.Package.get(rel['object_package_id']).name)
    # existing document fields win over relationship types of the same name
    for key, value in rel_dict.items():
        if key not in pkg_dict:
            pkg_dict[key] = value

    pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

    # Save dataset type
    pkg_dict['dataset_type'] = pkg_dict['type']

    # clean the dict fixing keys and dates
    # FIXME where are we getting these dirty keys from? can we not just
    # fix them in the correct place or is this something that always will
    # be needed? For my data not changing the keys seems to not cause a
    # problem.
    new_dict = {}
    # sentinel default: if parse() returns it unchanged, the value was empty
    bogus_date = datetime.datetime(1, 1, 1)
    for key, value in pkg_dict.items():
        # Drop non-ASCII characters from the key but keep it a native
        # string: on Python 3 encode() returns bytes, which would break the
        # endswith() test below and produce bytes keys in the document.
        key = key.encode('ascii', 'ignore')
        if not isinstance(key, str):
            key = key.decode('ascii')
        if key.endswith('_date'):
            try:
                date = parse(value, default=bogus_date)
                if date != bogus_date:
                    value = date.isoformat() + 'Z'
                else:
                    # The date field was empty, so dateutil filled it with
                    # the default bogus date
                    value = None
            except (ValueError, IndexError):
                # unparseable *_date fields are dropped from the document
                continue
        new_dict[key] = value
    pkg_dict = new_dict

    for k in ('title', 'notes', 'title_string'):
        if k in pkg_dict and pkg_dict[k]:
            pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

    # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
    # with UTC time (i.e trailing Z)
    # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
    pkg_dict['metadata_created'] += 'Z'
    pkg_dict['metadata_modified'] += 'Z'

    # mark this CKAN instance as data source:
    pkg_dict['site_id'] = config.get('ckan.site_id')

    # Strip a selection of the fields.
    # These fields are possible candidates for sorting search results on,
    # so we strip leading spaces because solr will sort " " before "a" or "A".
    for field_name in ['title']:
        value = pkg_dict.get(field_name)
        if value:
            pkg_dict[field_name] = value.lstrip()

    # add a unique index_id to avoid conflicts
    # (md5 of the dataset id concatenated with the site id; hashlib needs
    # bytes on Python 3, byte strings are passed through untouched)
    import hashlib
    index_key = '%s%s' % (pkg_dict['id'], config.get('ckan.site_id'))
    if not isinstance(index_key, bytes):
        index_key = index_key.encode('utf-8')
    pkg_dict['index_id'] = hashlib.md5(index_key).hexdigest()

    # every IPackageController plugin may replace the document
    for item in PluginImplementations(IPackageController):
        pkg_dict = item.before_index(pkg_dict)

    assert pkg_dict, 'Plugin must return non empty package dict on index'

    # permission labels determine visibility in search, can't be set
    # in original dataset or before_index plugins
    labels = lib_plugins.get_permission_labels()
    dataset = model.Package.get(pkg_dict['id'])
    pkg_dict['permission_labels'] = labels.get_dataset_labels(
        dataset) if dataset else []  # TestPackageSearchIndex-workaround

    # send to solr:
    conn = None  # stays None if make_connection() itself fails
    try:
        conn = make_connection()
        commit = not defer_commit
        # the config switch can only turn committing off, never force it on
        if not asbool(config.get('ckan.search.solr_commit', 'true')):
            commit = False
        conn.add(docs=[pkg_dict], commit=commit)
    except pysolr.SolrError as e:
        # Exceptions cannot be sliced on Python 3; use the message itself
        # and limit huge responses.
        detail = e.args[0][:1000] if e.args else ''
        msg = 'Solr returned an error: {0}'.format(detail)
        raise SearchIndexError(msg)
    except socket.error as e:
        err = 'Could not connect to Solr using {0}: {1}'.format(
            getattr(conn, 'url', '<unknown>'), str(e))
        log.error(err)
        raise SearchIndexError(err)

    commit_debug_msg = 'Not committed yet' if defer_commit else 'Committed'
    log.debug('Updated index for %s [%s]' %
              (pkg_dict.get('name'), commit_debug_msg))