Ejemplo n.º 1
0
Archivo: index.py Proyecto: Quoin/ckan
def clear_index():
    conn = make_connection()
    query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
    try:
        conn.delete(q=query)
    except socket.error as e:
        err = 'Could not connect to SOLR %r: %r' % (conn.url, e)
        log.error(err)
        raise SearchIndexError(err)
    except pysolr.SolrError as e:
        err = 'SOLR %r exception: %r' % (conn.url, e)
        log.error(err)
        raise SearchIndexError(err)
Ejemplo n.º 2
0
 def commit(self):
     try:
         conn = make_connection()
         conn.commit(wait_searcher=False)
     except Exception, e:
         log.exception(e)
         raise SearchIndexError(e)
Ejemplo n.º 3
0
Archivo: index.py Proyecto: Quoin/ckan
 def delete_package(self, pkg_dict):
     conn = make_connection()
     query = "+%s:%s AND +(id:\"%s\" OR name:\"%s\") AND +site_id:\"%s\"" % \
             (TYPE_FIELD, PACKAGE_TYPE, pkg_dict.get('id'), pkg_dict.get('id'), config.get('ckan.site_id'))
     try:
         commit = asbool(config.get('ckan.search.solr_commit', 'true'))
         conn.delete(q=query, commit=commit)
     except Exception as e:
         log.exception(e)
         raise SearchIndexError(e)
Ejemplo n.º 4
0
def clear_index():
    import solr.core
    conn = make_connection()
    query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        conn.commit()
    except socket.error, e:
        err = 'Could not connect to SOLR %r: %r' % (conn.url, e)
        log.error(err)
        raise SearchIndexError(err)
Ejemplo n.º 5
0
 def delete_package(self, pkg_dict):
     conn = make_connection()
     query = "+%s:%s (+id:\"%s\" OR +name:\"%s\") +site_id:\"%s\"" % (
         TYPE_FIELD, PACKAGE_TYPE, pkg_dict.get('id'), pkg_dict.get('id'),
         config.get('ckan.site_id'))
     try:
         conn.delete_query(query)
         conn.commit()
     except Exception, e:
         log.exception(e)
         raise SearchIndexError(e)
Ejemplo n.º 6
0
    def index_package(self, pkg_dict, defer_commit=False):
        if pkg_dict is None:
            return
        pkg_dict['data_dict'] = json.dumps(pkg_dict)

        # add to string field for sorting
        title = pkg_dict.get('title')
        if title:
            pkg_dict['title_string'] = title

        if (not pkg_dict.get('state')) or ('active'
                                           not in pkg_dict.get('state')):
            return self.delete_package(pkg_dict)

        index_fields = RESERVED_FIELDS + pkg_dict.keys()

        # include the extras in the main namespace
        extras = pkg_dict.get('extras', [])
        for extra in extras:
            key, value = extra['key'], json.loads(extra['value'])
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        pkg_dict.pop('extras', None)

        # add tags, removing vocab tags from 'tags' list and adding them as
        # vocab_<tag name> so that they can be used in facets
        non_vocab_tag_names = []
        tags = pkg_dict.pop('tags', [])
        context = {'model': model}

        for tag in tags:
            if tag.get('vocabulary_id'):
                data = {'id': tag['vocabulary_id']}
                vocab = logic.get_action('vocabulary_show')(context, data)
                key = u'vocab_%s' % vocab['name']
                if key in pkg_dict:
                    pkg_dict[key].append(tag['name'])
                else:
                    pkg_dict[key] = [tag['name']]
            else:
                non_vocab_tag_names.append(tag['name'])

        pkg_dict['tags'] = non_vocab_tag_names

        # add groups
        groups = pkg_dict.pop('groups', [])

        # we use the capacity to make things private in the search index
        if pkg_dict['private']:
            pkg_dict['capacity'] = 'private'
        else:
            pkg_dict['capacity'] = 'public'

        pkg_dict['groups'] = [group['name'] for group in groups]

        # if there is an owner_org we want to add this to groups for index
        # purposes
        if pkg_dict['owner_org']:
            pkg_dict['groups'].append(pkg_dict['organization']['name'])

        # tracking
        tracking_summary = pkg_dict.pop('tracking_summary', None)
        if tracking_summary:
            pkg_dict['views_total'] = tracking_summary['total']
            pkg_dict['views_recent'] = tracking_summary['recent']

        # flatten the structure for indexing:
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in [('description', 'res_description'),
                                 ('format', 'res_format'), ('url', 'res_url')]:
                pkg_dict[nkey] = pkg_dict.get(nkey,
                                              []) + [resource.get(okey, u'')]
        pkg_dict.pop('resources', None)

        rel_dict = collections.defaultdict(list)
        subjects = pkg_dict.pop("relationships_as_subject", [])
        objects = pkg_dict.pop("relationships_as_object", [])
        for rel in objects:
            type = model.PackageRelationship.forward_to_reverse_type(
                rel['type'])
            rel_dict[type].append(
                model.Package.get(rel['subject_package_id']).name)
        for rel in subjects:
            type = rel['type']
            rel_dict[type].append(
                model.Package.get(rel['object_package_id']).name)
        for key, value in rel_dict.iteritems():
            if key not in pkg_dict:
                pkg_dict[key] = value

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

        pkg_dict = dict([(k.encode('ascii', 'ignore'), v)
                         for (k, v) in pkg_dict.items()])

        for k in ('title', 'notes', 'title_string'):
            if k in pkg_dict and pkg_dict[k]:
                pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
        # with UTC time (i.e trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict['metadata_created'] += 'Z'
        pkg_dict['metadata_modified'] += 'Z'

        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')

        # Strip a selection of the fields.
        # These fields are possible candidates for sorting search results on,
        # so we strip leading spaces because solr will sort " " before "a" or "A".
        for field_name in ['title']:
            try:
                value = pkg_dict.get(field_name)
                if value:
                    pkg_dict[field_name] = value.lstrip()
            except KeyError:
                pass

        # add a unique index_id to avoid conflicts
        import hashlib
        pkg_dict['index_id'] = hashlib.md5(
            '%s%s' % (pkg_dict['id'], config.get('ckan.site_id'))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, 'Plugin must return non empty package dict on index'

        # send to solr:
        try:
            conn = make_connection()
            commit = not defer_commit
            if not asbool(config.get('ckan.search.solr_commit', 'true')):
                commit = False
            conn.add_many([pkg_dict], _commit=commit)
        except Exception, e:
            log.exception(e)
            raise SearchIndexError(e)
Ejemplo n.º 7
0
def clear_index():
    import solr.core
    conn = make_connection()
    query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        conn.commit()
    except socket.error, e:
        err = 'Could not connect to SOLR %r: %r' % (conn.url, e)
        log.error(err)
        raise SearchIndexError(err)
    except solr.core.SolrException, e:
        err = 'SOLR %r exception: %r' % (conn.url, e)
        log.error(err)
        raise SearchIndexError(err)
    finally:
        conn.close()


class SearchIndex(object):
    """
    A search index handles the management of documents of a specific type in the
    index, but no queries.
    The default implementation maps many of the methods, so most subclasses will
    only have to implement ``update_dict`` and ``remove_dict``.
    """
    def __init__(self):
        pass

    def insert_dict(self, data):
Ejemplo n.º 8
0
class PackageSearchIndex(SearchIndex):
    def remove_dict(self, pkg_dict):
        self.delete_package(pkg_dict)

    def update_dict(self, pkg_dict, defer_commit=False):
        self.index_package(pkg_dict, defer_commit)

    def index_package(self, pkg_dict, defer_commit=False):
        if pkg_dict is None:
            return

        # tracking summary values will be stale, never store them
        tracking_summary = pkg_dict.pop('tracking_summary', None)
        for r in pkg_dict.get('resources', []):
            r.pop('tracking_summary', None)

        data_dict_json = json.dumps(pkg_dict)

        if config.get('ckan.cache_validated_datasets', True):
            package_plugin = lib_plugins.lookup_package_plugin(
                pkg_dict.get('type'))

            schema = package_plugin.show_package_schema()
            validated_pkg_dict, errors = lib_plugins.plugin_validate(
                package_plugin, {'model': model, 'session': model.Session},
                pkg_dict, schema, 'package_show')
            pkg_dict['validated_data_dict'] = json.dumps(validated_pkg_dict,
                cls=ckan.lib.navl.dictization_functions.MissingNullEncoder)

        pkg_dict['data_dict'] = data_dict_json

        # add to string field for sorting
        title = pkg_dict.get('title')
        if title:
            pkg_dict['title_string'] = title

        # delete the package if there is no state, or the state is `deleted`
        if (not pkg_dict.get('state') or 'deleted' in pkg_dict.get('state')):
            return self.delete_package(pkg_dict)

        index_fields = RESERVED_FIELDS + pkg_dict.keys()

        # include the extras in the main namespace
        extras = pkg_dict.get('extras', [])
        for extra in extras:
            key, value = extra['key'], extra['value']
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        pkg_dict.pop('extras', None)

        # add tags, removing vocab tags from 'tags' list and adding them as
        # vocab_<tag name> so that they can be used in facets
        non_vocab_tag_names = []
        tags = pkg_dict.pop('tags', [])
        context = {'model': model}

        for tag in tags:
            if tag.get('vocabulary_id'):
                data = {'id': tag['vocabulary_id']}
                vocab = logic.get_action('vocabulary_show')(context, data)
                key = u'vocab_%s' % vocab['name']
                if key in pkg_dict:
                    pkg_dict[key].append(tag['name'])
                else:
                    pkg_dict[key] = [tag['name']]
            else:
                non_vocab_tag_names.append(tag['name'])

        pkg_dict['tags'] = non_vocab_tag_names

        # add groups
        groups = pkg_dict.pop('groups', [])

        # we use the capacity to make things private in the search index
        if pkg_dict['private']:
            pkg_dict['capacity'] = 'private'
        else:
            pkg_dict['capacity'] = 'public'

        pkg_dict['groups'] = [group['name'] for group in groups]

        # if there is an owner_org we want to add this to groups for index
        # purposes
        if pkg_dict.get('organization'):
           pkg_dict['organization'] = pkg_dict['organization']['name']
        else:
           pkg_dict['organization'] = None

        # tracking
        if not tracking_summary:
            tracking_summary = model.TrackingSummary.get_for_package(
                pkg_dict['id'])
        pkg_dict['views_total'] = tracking_summary['total']
        pkg_dict['views_recent'] = tracking_summary['recent']

        resource_fields = [('name', 'res_name'),
                           ('description', 'res_description'),
                           ('format', 'res_format'),
                           ('url', 'res_url'),
                           ('resource_type', 'res_type')]
        resource_extras = [(e, 'res_extras_' + e) for e
                            in model.Resource.get_extra_columns()]
        # flatten the structure for indexing:
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in resource_fields + resource_extras:
                pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
        pkg_dict.pop('resources', None)

        rel_dict = collections.defaultdict(list)
        subjects = pkg_dict.pop("relationships_as_subject", [])
        objects = pkg_dict.pop("relationships_as_object", [])
        for rel in objects:
            type = model.PackageRelationship.forward_to_reverse_type(rel['type'])
            rel_dict[type].append(model.Package.get(rel['subject_package_id']).name)
        for rel in subjects:
            type = rel['type']
            rel_dict[type].append(model.Package.get(rel['object_package_id']).name)
        for key, value in rel_dict.iteritems():
            if key not in pkg_dict:
                pkg_dict[key] = value

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

        # Save dataset type
        pkg_dict['dataset_type'] = pkg_dict['type']

        # clean the dict fixing keys and dates
        # FIXME where are we getting these dirty keys from?  can we not just
        # fix them in the correct place or is this something that always will
        # be needed?  For my data not changing the keys seems to not cause a
        # problem.
        new_dict = {}
        bogus_date = datetime.datetime(1, 1, 1)
        for key, value in pkg_dict.items():
            key = key.encode('ascii', 'ignore')
            if key.endswith('_date'):
                try:
                    date = parse(value, default=bogus_date)
                    if date != bogus_date:
                        value = date.isoformat() + 'Z'
                    else:
                        # The date field was empty, so dateutil filled it with
                        # the default bogus date
                        value = None
                except ValueError:
                    continue
            new_dict[key] = value
        pkg_dict = new_dict

        for k in ('title', 'notes', 'title_string'):
            if k in pkg_dict and pkg_dict[k]:
                pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
        # with UTC time (i.e trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict['metadata_created'] += 'Z'
        pkg_dict['metadata_modified'] += 'Z'

        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')

        # Strip a selection of the fields.
        # These fields are possible candidates for sorting search results on,
        # so we strip leading spaces because solr will sort " " before "a" or "A".
        for field_name in ['title']:
            try:
                value = pkg_dict.get(field_name)
                if value:
                    pkg_dict[field_name] = value.lstrip()
            except KeyError:
                pass

        # add a unique index_id to avoid conflicts
        import hashlib
        pkg_dict['index_id'] = hashlib.md5('%s%s' % (pkg_dict['id'],config.get('ckan.site_id'))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, 'Plugin must return non empty package dict on index'

        # send to solr:
        try:
            conn = make_connection()
            commit = not defer_commit
            if not asbool(config.get('ckan.search.solr_commit', 'true')):
                commit = False
            conn.add(docs=[pkg_dict], commit=commit)
        except pysolr.SolrError, e:
            msg = 'Solr returned an error: {0}'.format(
                e[:1000] # limit huge responses
            )
            raise SearchIndexError(msg)
        except socket.error, e:
            err = 'Could not connect to Solr using {0}: {1}'.format(conn.url, str(e))
            log.error(err)
            raise SearchIndexError(err)
Ejemplo n.º 9
0
    def index_package(self, pkg_dict):
        if pkg_dict is None:
            return

        if (not pkg_dict.get('state')) or ('active'
                                           not in pkg_dict.get('state')):
            return self.delete_package(pkg_dict)

        index_fields = RESERVED_FIELDS + pkg_dict.keys()

        # include the extras in the main namespace
        extras = pkg_dict.get('extras', [])
        for extra in extras:
            key, value = extra['key'], json.loads(extra['value'])
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        pkg_dict.pop('extras', None)

        #Add tags and groups
        tags = pkg_dict.pop('tags', [])
        pkg_dict['tags'] = [tag['name'] for tag in tags]

        groups = pkg_dict.pop('groups', [])

        # Capacity is different to the default only if using organizations
        # where the dataset is only in one group. We will add the capacity
        # from the single group that it is a part of if we have a group
        if len(groups):
            pkg_dict['capacity'] = groups[0].get('capacity', 'public')

        pkg_dict['groups'] = [group['name'] for group in groups]

        # flatten the structure for indexing:
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in [('description', 'res_description'),
                                 ('format', 'res_format'), ('url', 'res_url')]:
                pkg_dict[nkey] = pkg_dict.get(nkey,
                                              []) + [resource.get(okey, u'')]
        pkg_dict.pop('resources', None)

        rel_dict = collections.defaultdict(list)
        subjects = pkg_dict.pop("relationships_as_subject", [])
        objects = pkg_dict.pop("relationships_as_object", [])
        for rel in objects:
            type = model.PackageRelationship.forward_to_reverse_type(
                rel['type'])
            rel_dict[type].append(
                model.Package.get(rel['subject_package_id']).name)
        for rel in subjects:
            type = rel['type']
            rel_dict[type].append(
                model.Package.get(rel['object_package_id']).name)
        for key, value in rel_dict.iteritems():
            if key not in pkg_dict:
                pkg_dict[key] = value

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

        pkg_dict = dict([(k.encode('ascii', 'ignore'), v)
                         for (k, v) in pkg_dict.items()])

        for k in ('title', 'notes'):
            if k in pkg_dict and pkg_dict[k]:
                pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
        # with UTC time (i.e trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict['metadata_created'] += 'Z'
        pkg_dict['metadata_modified'] += 'Z'

        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')

        # Strip a selection of the fields.
        # These fields are possible candidates for sorting search results on,
        # so we strip leading spaces because solr will sort " " before "a" or "A".
        for field_name in ['title']:
            try:
                value = pkg_dict.get(field_name)
                if value:
                    pkg_dict[field_name] = value.lstrip()
            except KeyError:
                pass

        # add a unique index_id to avoid conflicts
        import hashlib
        pkg_dict['index_id'] = hashlib.md5(
            '%s%s' % (pkg_dict['id'], config.get('ckan.site_id'))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, 'Plugin must return non empty package dict on index'

        # send to solr:
        try:
            conn = make_connection()
            conn.add_many([pkg_dict])
            conn.commit(wait_flush=False, wait_searcher=False)
        except Exception, e:
            log.exception(e)
            raise SearchIndexError(e)
Ejemplo n.º 10
0
    def index_package(self, pkg_dict):
        if pkg_dict is None:
            return

        if (not pkg_dict.get('state')) or ('active'
                                           not in pkg_dict.get('state')):
            return self.delete_package(pkg_dict)

        conn = make_connection()
        index_fields = RESERVED_FIELDS + pkg_dict.keys()

        # include the extras in the main namespace
        extras = pkg_dict.get('extras', {})
        for (key, value) in extras.items():
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        if 'extras' in pkg_dict:
            del pkg_dict['extras']

        # flatten the structure for indexing:
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in [('description', 'res_description'),
                                 ('format', 'res_format'), ('url', 'res_url')]:
                pkg_dict[nkey] = pkg_dict.get(nkey,
                                              []) + [resource.get(okey, u'')]
        if 'resources' in pkg_dict:
            del pkg_dict['resources']

        # index relationships as <type>:<object>
        rel_dict = {}
        rel_types = list(itertools.chain(RELATIONSHIP_TYPES))
        for rel in pkg_dict.get('relationships', []):
            _type = rel.get('type', 'rel')
            if (_type in pkg_dict.keys()) or (_type not in rel_types):
                continue
            rel_dict[_type] = rel_dict.get(_type, []) + [rel.get('object')]

        pkg_dict.update(rel_dict)

        if 'relationships' in pkg_dict:
            del pkg_dict['relationships']

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE
        pkg_dict = dict([(k.encode('ascii', 'ignore'), v)
                         for (k, v) in pkg_dict.items()])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
        # with UTC time (i.e trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict['metadata_created'] += 'Z'
        pkg_dict['metadata_modified'] += 'Z'

        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')

        # add a unique index_id to avoid conflicts
        import hashlib
        pkg_dict['index_id'] = hashlib.md5(
            '%s%s' % (pkg_dict['id'], config.get('ckan.site_id'))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, 'Plugin must return non empty package dict on index'

        # send to solr:
        try:
            conn.add_many([pkg_dict])
            conn.commit(wait_flush=False, wait_searcher=False)
        except Exception, e:
            log.exception(e)
            raise SearchIndexError(e)