Ejemplo n.º 1
0
Archivo: query.py Proyecto: sipf/ckan
    def get_index(self, reference):
        '''
        Return the SOLR index record for a dataset.

        :param reference: value matched against both the ``name`` and the
            ``id`` fields of the index
        :returns: the first document of the SOLR response

        Raises SearchError if SOLR reports an error or if no document
        matches the reference.
        '''
        query = {
            'rows': 1,
            'q': 'name:"%s" OR id:"%s"' % (reference, reference),
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')
        }

        # 'q' is always set in the literal above, so the lookup cannot raise
        # KeyError and needs no handler around it.
        if query['q'].startswith('{!'):
            raise SearchError('Local parameters are not supported.')

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e))

        if solr_response.hits == 0:
            raise SearchError('Dataset not found in the search index: %s' %
                              reference)
        return solr_response.docs[0]
Ejemplo n.º 2
0
def check_solr_schema_version(schema_file=None):
    '''
        Check that the SOLR schema version is one of
        SUPPORTED_SCHEMA_VERSIONS.

        Unless ``schema_file`` is given, the schema is requested from the
        SOLR server: first at SOLR_SCHEMA_FILE_OFFSET_MANAGED and, if that
        attempt ends in an HTTP error, at SOLR_SCHEMA_FILE_OFFSET_CLASSIC.
        The version is read from the ``version`` attribute of the root
        element of the schema XML.

        :schema_file: Path to a local schema file that is read instead of
                      contacting the server (Default is None)

        Returns False, after logging a warning, when ``is_available()``
        reports a problem; returns True when the version is supported.
        Raises SearchError when the version attribute is empty or missing,
        or when the version is not in the supported list.
    '''

    if not is_available():
        # No point checking the version if the server cannot be reached
        log.warn('Problems were found while connecting to the SOLR server')
        return False

    if schema_file:
        # Explicit file given: read it rather than asking the server
        with open(schema_file, 'rb') as schema_fh:
            schema_xml = schema_fh.read()
    else:
        try:
            # Managed schema first
            response = _get_schema_from_solr(SOLR_SCHEMA_FILE_OFFSET_MANAGED)
            response.raise_for_status()
        except requests.HTTPError:
            # Otherwise fall back to the manually edited schema.xml
            response = _get_schema_from_solr(SOLR_SCHEMA_FILE_OFFSET_CLASSIC)
        schema_xml = response.text

    document = xml.dom.minidom.parseString(schema_xml)
    version = document.documentElement.getAttribute('version')

    if not version:
        problem = 'Could not extract version info from the SOLR schema'
        if schema_file:
            problem += ', using file {}'.format(schema_file)
        raise SearchError(problem)

    if version not in SUPPORTED_SCHEMA_VERSIONS:
        supported = ', '.join(SUPPORTED_SCHEMA_VERSIONS)
        raise SearchError('SOLR schema version not supported: %s. Supported'
                          ' versions are [%s]'
                          % (version, supported))
    return True
Ejemplo n.º 3
0
class PackageSearchQuery(SearchQuery):
    """Package-specific queries against the SOLR index."""

    def get_all_entity_ids(self, max_results=1000):
        """
        Return a list of the IDs of all indexed packages.

        The query is filtered on this site's ``ckan.site_id`` and on
        ``state:active``; at most ``max_results`` rows are requested.
        """
        query = "*:*"
        fq = "+site_id:\"%s\" " % config.get('ckan.site_id')
        fq += "+state:active "

        conn = make_connection()
        try:
            data = conn.query(query, fq=fq, rows=max_results, fields='id')
        finally:
            conn.close()

        return [r.get('id') for r in data.results]

    def get_index(self, reference):
        '''
        For a given package reference (ID or name), returns the record for it
        from the SOLR index.

        Raises SearchError if SOLR reports an error, if the response cannot
        be processed, or if no document matches the reference.
        '''
        query = {
            'rows': 1,
            'q': 'name:%s OR id:%s' % (reference, reference),
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')
        }

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e.reason))
        finally:
            # Release the connection once the query has run, as
            # get_all_entity_ids does.
            conn.close()
        try:
            data = json.loads(solr_response)

            if data['response']['numFound'] == 0:
                raise SearchError('Dataset not found in the search index: %s' %
                                  reference)
            else:
                return data['response']['docs'][0]
        except Exception as e:
            # Only unexpected failures are logged; the "not found"
            # SearchError raised above is re-wrapped without logging.
            if not isinstance(e, SearchError):
                log.exception(e)
            raise SearchError(e)
Ejemplo n.º 4
0
class PackageSearchQuery(SearchQuery):
    """Package-specific queries against the SOLR index."""

    def get_all_entity_ids(self, max_results=1000):
        """
        Return a list of the IDs of all indexed packages.

        The query is filtered on this site's ``ckan.site_id`` and on
        ``state:active``; at most ``max_results`` rows are requested.
        """
        query = "*:*"
        fq = "+site_id:\"%s\" " % config.get('ckan.site_id')
        fq += "+state:active "

        conn = make_connection()
        data = conn.search(query, fq=fq, rows=max_results, fields='id')
        return [r.get('id') for r in data.docs]

    def get_index(self, reference):
        '''
        Return the SOLR index record for a package.

        :param reference: value matched against both the ``name`` and the
            ``id`` fields of the index
        :returns: the first document of the SOLR response

        Raises SearchError if SOLR reports an error or if no document
        matches the reference.
        '''
        query = {
            'rows': 1,
            'q': 'name:"%s" OR id:"%s"' % (reference, reference),
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')
        }

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e))

        if solr_response.hits == 0:
            raise SearchError('Dataset not found in the search index: %s' %
                              reference)
        return solr_response.docs[0]
Ejemplo n.º 5
0
 def run(self,
         query=None,
         terms=None,
         fields=None,
         facet_by=None,
         options=None,
         **kwargs):
     """
     Placeholder search entry point; always raises SearchError.

     None of the arguments are used here. The list and dict defaults were
     replaced with None so that no mutable object is shared between calls.
     """
     raise SearchError("SearchQuery.run() not implemented!")
Ejemplo n.º 6
0
def query_for(_type):
    """
    Get a SearchQuery instance sub-class suitable for the specified type.

    The type is passed through ``_normalize_type`` and looked up in
    ``_QUERIES``; the entry found is called with no arguments and its
    result is returned.

    Raises SearchError if a KeyError occurs while doing so.
    """
    try:
        _type_n = _normalize_type(_type)
        return _QUERIES[_type_n]()
    except KeyError:
        raise SearchError("Unknown search type: %s" % _type)
Ejemplo n.º 7
0
Archivo: query.py Proyecto: frafra/ckan
 def run(self,
         query: Optional[Union[str, dict[str, Any]]] = None,
         terms: Optional[list[str]] = None,
         fields: Optional[dict[str, Any]] = None,
         facet_by: Optional[list[str]] = None,
         options: Optional[QueryOptions] = None,
         **kwargs: Any) -> NoReturn:
     """
     Placeholder search entry point; always raises SearchError.

     None of the arguments are used here.
     """
     message = "SearchQuery.run() not implemented!"
     raise SearchError(message)
Ejemplo n.º 8
0
    def get_index(self, reference):
        '''
        Query the SOLR index for the record whose ``name`` or ``id``
        matches ``reference``.

        Raises SearchError if SOLR reports an error.

        NOTE(review): this excerpt ends after the SOLR call; the response
        is assigned but its handling is not visible here.
        '''
        query = {
            'rows': 1,
            'q': 'name:"%s" OR id:"%s"' % (reference, reference),
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')}

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            raise SearchError('SOLR returned an error running query: %r Error: %r' %
                              (query, e))
Ejemplo n.º 9
0
    def get_index(self, reference):
        '''
        Query the SOLR index for the record whose ``name`` or ``id``
        matches ``reference``.

        Raises SearchError, carrying the SOLR error reason, if SOLR reports
        an error.

        NOTE(review): this excerpt ends after the SOLR call; the response
        is assigned but its handling is not visible here.
        '''
        query = {
            'rows': 1,
            'q': 'name:%s OR id:%s' % (reference, reference),
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')
        }

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e.reason))
Ejemplo n.º 10
0
    def run(self, query):
        '''
        Performs a dataset search using the given query.

        The query dictionary is updated in place with defaults (q, rows,
        sort, fq, faceting, fl, wt and, for non-fielded searches, the
        dismax settings) before it is sent to SOLR.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - dictionary with keys results and count

        May raise SearchQueryError or SearchError.

        NOTE(review): this excerpt ends after the SOLR call; the handling
        of the response is not visible here.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        invalid_params = list(set(query.keys()) - VALID_SOLR_PARAMETERS)
        if invalid_params:
            raise SearchQueryError("Invalid search parameters: %s" %
                                   invalid_params)

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results (capped at 1000)
        rows_to_return = min(1000, int(query.get('rows', 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        # order by score if no 'sort' term given
        order_by = query.get('sort')
        if order_by == 'rank' or order_by is None:
            query['sort'] = 'score desc, name asc'

        # show only results from this CKAN instance
        fq = query.get('fq', '')
        if '+site_id:' not in fq:
            fq += ' +site_id:"%s"' % config.get('ckan.site_id')

        # filter for package status
        if '+state:' not in fq:
            fq += " +state:active"
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get(
            'facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # fields to return; only the package name unless the caller asked
        # for something else
        query['fl'] = query.get('fl', 'name')

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # If the query has a colon in it then consider it a fielded search
        # and do not use dismax.
        if ':' not in query['q']:
            query['defType'] = 'dismax'
            query['tie'] = '0.1'
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = '2<-1 5<80%'
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e.reason))
Ejemplo n.º 11
0
            # #1683 Filter out the last row that is sometimes out of order
            self.results = self.results[:rows_to_return]

            # get any extras and add to 'extras' dict
            for result in self.results:
                extra_keys = filter(lambda x: x.startswith('extras_'),
                                    result.keys())
                extras = {}
                for extra_key in extra_keys:
                    value = result.pop(extra_key)
                    extras[extra_key[len('extras_'):]] = value
                if extra_keys:
                    result['extras'] = extras

            # if just fetching the id or name, return a list instead of a dict
            if query.get('fl') in ['id', 'name']:
                self.results = [r.get(query.get('fl')) for r in self.results]

            # get facets and convert facets list to a dict
            self.facets = data.get('facet_counts', {}).get('facet_fields', {})
            for field, values in self.facets.iteritems():
                self.facets[field] = dict(zip(values[0::2], values[1::2]))
        except Exception, e:
            log.exception(e)
            raise SearchError(e)
        finally:
            conn.close()

        return {'results': self.results, 'count': self.count}
Ejemplo n.º 12
0
    def run(self, query, permission_labels=None, **kwargs):
        '''
        Performs a dataset search using the given query.

        The query dictionary is updated in place with defaults before it is
        sent to SOLR. The outcome is also stored on ``self.count``,
        ``self.results`` and ``self.facets``.

        :param query: dictionary with keys like: q, fq, sort, rows, facet
        :type query: dict
        :param permission_labels: filter results to those that include at
            least one of these labels. None to not filter (return everything)
        :type permission_labels: list of unicode strings; or None

        :returns: dictionary with keys results and count

        May raise SearchQueryError or SearchError.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        invalid_params = list(set(query.keys()) - VALID_SOLR_PARAMETERS)
        if invalid_params:
            raise SearchQueryError("Invalid search parameters: %s" %
                                   invalid_params)

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results (capped at 1000)
        rows_to_return = min(1000, int(query.get('rows', 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        fq = []
        if 'fq' in query:
            fq.append(query['fq'])
        fq.extend(query.get('fq_list', []))

        # show only results from this CKAN instance
        fq.append('+site_id:%s' % solr_literal(config.get('ckan.site_id')))

        # filter for package status
        if '+state:' not in query.get('fq', ''):
            fq.append('+state:active')

        # only return things we should be able to see
        if permission_labels is not None:
            fq.append('+permission_labels:(%s)' %
                      ' OR '.join(solr_literal(p) for p in permission_labels))
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get(
            'facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # fields to return; only the package name unless the caller asked
        # for something else
        query['fl'] = query.get('fl', 'name')

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # A query containing a colon is considered a fielded search and is
        # left to the default parser, unless edismax was requested
        # explicitly; otherwise the dismax settings are filled in.
        defType = query.get('defType', 'dismax')
        if ':' not in query['q'] or defType == 'edismax':
            query['defType'] = defType
            query['tie'] = query.get('tie', '0.1')
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = query.get('mm', '2<-1 5<80%')
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            # Error with the sort parameter.  You see slightly different
            # error messages depending on whether the SOLR JSON comes back
            # or Jetty gets in the way converting it to HTML - not sure why
            #
            if e.args and isinstance(e.args[0], str):
                if "Can't determine a Sort Order" in e.args[0] or \
                        "Can't determine Sort Order" in e.args[0] or \
                        'Unknown sort order' in e.args[0]:
                    raise SearchQueryError('Invalid "sort" parameter')
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e))
        self.count = solr_response.hits
        self.results = solr_response.docs

        # #1683 Filter out the last row that is sometimes out of order
        self.results = self.results[:rows_to_return]

        # get any extras and add to 'extras' dict
        for result in self.results:
            # Build a real list: on Python 3 filter() is lazy, so popping
            # keys while iterating it would fail and the emptiness test
            # below would always be true.
            extra_keys = [key for key in result.keys()
                          if key.startswith('extras_')]
            extras = {}
            for extra_key in extra_keys:
                value = result.pop(extra_key)
                extras[extra_key[len('extras_'):]] = value
            if extra_keys:
                result['extras'] = extras

        # if just fetching the id or name, return a list instead of a dict
        if query.get('fl') in ['id', 'name']:
            self.results = [r.get(query.get('fl')) for r in self.results]

        # get facets and convert facets list to a dict
        # (each list alternates value, count, value, count, ...)
        self.facets = solr_response.facets.get('facet_fields', {})
        for field, values in six.iteritems(self.facets):
            self.facets[field] = dict(zip(values[0::2], values[1::2]))

        return {'results': self.results, 'count': self.count}
Ejemplo n.º 13
0
    def run(self, query, permission_labels=None, **kwargs):
        '''
        Performs a dataset search using the given query.

        The query dictionary is updated in place with defaults before it is
        sent to SOLR.

        :param query: dictionary with keys like: q, fq, sort, rows, facet
        :type query: dict
        :param permission_labels: filter results to those that include at
            least one of these labels. None to not filter (return everything)
        :type permission_labels: list of unicode strings; or None

        :returns: dictionary with keys results and count

        May raise SearchQueryError or SearchError.

        NOTE(review): this excerpt ends after the SOLR call; the handling
        of the response is not visible here.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid; IPackageController plugins that
        # provide update_valid_solr_parameters may replace the allowed set
        valid_solr_parameters = VALID_SOLR_PARAMETERS
        for item in plugins.PluginImplementations(plugins.IPackageController):
            if 'update_valid_solr_parameters' in dir(item):
                valid_solr_parameters = item.update_valid_solr_parameters(
                    valid_solr_parameters)

        invalid_params = list(set(query.keys()) - valid_solr_parameters)
        if invalid_params:
            raise SearchQueryError("Invalid search parameters: %s" %
                                   invalid_params)

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results (capped at 1000)
        rows_to_return = min(1000, int(query.get('rows', 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        fq = []
        if 'fq' in query:
            fq.append(query['fq'])
        fq.extend(query.get('fq_list', []))

        # show only results from this CKAN instance
        fq.append('+site_id:%s' % solr_literal(config.get('ckan.site_id')))

        # filter for package status
        if '+state:' not in query.get('fq', ''):
            fq.append('+state:active')

        # only return things we should be able to see
        if permission_labels is not None:
            fq.append('+permission_labels:(%s)' %
                      ' OR '.join(solr_literal(p) for p in permission_labels))
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get(
            'facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # fields to return; only the package name unless the caller asked
        # for something else
        query['fl'] = query.get('fl', 'name')

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # A query containing a colon is considered a fielded search and is
        # left to the default parser, unless edismax was requested
        # explicitly; otherwise the dismax settings are filled in.
        defType = query.get('defType', 'dismax')
        if ':' not in query['q'] or defType == 'edismax':
            query['defType'] = defType
            query['tie'] = query.get('tie', '0.1')
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = query.get('mm', '2<-1 5<80%')
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            # Error with the sort parameter.  You see slightly different
            # error messages depending on whether the SOLR JSON comes back
            # or Jetty gets in the way converting it to HTML - not sure why
            #
            if e.args and isinstance(e.args[0], str):
                if "Can't determine a Sort Order" in e.args[0] or \
                        "Can't determine Sort Order" in e.args[0] or \
                        'Unknown sort order' in e.args[0]:
                    raise SearchQueryError('Invalid "sort" parameter')
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e))
Ejemplo n.º 14
0
    def run(self, query):
        '''
        Performs a dataset search using the given query.

        The query dictionary is updated in place with defaults before it is
        sent to SOLR.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - dictionary with keys results and count

        May raise SearchQueryError or SearchError.

        NOTE(review): this excerpt ends after the SOLR call; the handling
        of the response is not visible here.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        invalid_params = list(set(query.keys()) - VALID_SOLR_PARAMETERS)
        if invalid_params:
            raise SearchQueryError("Invalid search parameters: %s" %
                                   invalid_params)

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results (capped at 1000)
        rows_to_return = min(1000, int(query.get('rows', 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        # show only results from this CKAN instance
        fq = query.get('fq', '')
        if '+site_id:' not in fq:
            fq += ' +site_id:"%s"' % config.get('ckan.site_id')

        # filter for package status
        if '+state:' not in fq:
            fq += " +state:active"
        query['fq'] = [fq]

        # any extra filter queries are sent alongside the main one
        fq_list = query.get('fq_list', [])
        query['fq'].extend(fq_list)

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get(
            'facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # fields to return; only the package name unless the caller asked
        # for something else
        query['fl'] = query.get('fl', 'name')

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # A query containing a colon is considered a fielded search and is
        # left to the default parser, unless edismax was requested
        # explicitly; otherwise the dismax settings are filled in.
        defType = query.get('defType', 'dismax')
        if ':' not in query['q'] or defType == 'edismax':
            query['defType'] = defType
            query['tie'] = query.get('tie', '0.1')
            # this minimum match is explained
            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
            query['mm'] = query.get('mm', '2<-1 5<80%')
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection(decode_dates=False)
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.search(**query)
        except pysolr.SolrError as e:
            # Error with the sort parameter.  You see slightly different
            # error messages depending on whether the SOLR JSON comes back
            # or Jetty gets in the way converting it to HTML - not sure why
            #
            if e.args and isinstance(e.args[0], str):
                if "Can't determine a Sort Order" in e.args[0] or \
                        "Can't determine Sort Order" in e.args[0] or \
                        'Unknown sort order' in e.args[0]:
                    raise SearchQueryError('Invalid "sort" parameter')
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e))
Ejemplo n.º 15
0
def _run(query):
    '''
    Custom final preparation of the solr query
    and call to the solr api.

    The query dictionary is updated in place with defaults before it is
    sent to SOLR, and the duration of the SOLR call is logged.

    :param query: dictionary of SOLR parameters (q, rows, fq, fq_list,
        facet settings, fl, wt, qf, extras)
    :return: NOTE(review): this excerpt ends after the SOLR call; nothing
        is returned by the lines visible here.

    May raise SearchError.
    '''

    # default query is to return all documents
    q = query.get('q')
    if not q or q == '""' or q == "''":
        query['q'] = "*:*"

    # number of results
    rows_to_return = query.get('rows', 0)
    if rows_to_return > 0:
        # #1683 Work around problem of last result being out of order
        #       in SOLR 1.4
        rows_to_query = rows_to_return + 1
    else:
        rows_to_query = rows_to_return
    query['rows'] = rows_to_query

    # show only results from this CKAN instance
    fq = query.get('fq', '')
    if '+site_id:' not in fq:
        fq += ' +site_id:"%s"' % config.get('ckan.site_id')

    # filter for package status
    if '+state:' not in fq:
        fq += " +state:active"
    query['fq'] = [fq]

    # any extra filter queries are sent alongside the main one
    fq_list = query.get('fq_list', [])
    query['fq'].extend(fq_list)

    # faceting
    query['facet'] = query.get('facet', 'true')
    query['facet.limit'] = query.get('facet.limit',
                                     config.get('search.facets.limit', '50'))
    query['facet.mincount'] = query.get('facet.mincount', 1)

    # fields to return; only the package name unless the caller asked
    # for something else
    query['fl'] = query.get('fl', 'name')

    # return results as json encoded string
    query['wt'] = query.get('wt', 'json')

    # how the search terms are combined: 'all', 'any' or 'exact'
    boolean = query.get('extras', {}).get('ext_boolean', 'all')
    if boolean not in ['all', 'any', 'exact']:
        log.error('Ignoring unknown boolean search operator %r' % (boolean, ))
        boolean = 'all'

    # If the query has a colon in it then consider it a fielded search and
    # do not use dismax.
    if ':' not in query['q']:
        query['defType'] = 'dismax'
        query['tie'] = '0.1'
        if boolean == 'any':
            query['mm'] = '0'
        elif boolean == 'all':
            query['mm'] = '100%'
        elif boolean == 'exact':
            # search for the whole text as one quoted phrase
            query['q'] = '"' + q.replace('"', '\\"') + '"'
        query['qf'] = query.get('qf', QUERY_FIELDS)

    conn = make_connection()
    log.info('Package query: %r' % query)
    try:
        start_time = time.time()
        solr_response = conn.raw_query(**query)
        duration = time.time() - start_time
        log.info("Solr returned the result after {0}".format(duration))
    except SolrException as e:
        raise SearchError(
            'SOLR returned an error running query: %r Error: %r' %
            (query, e.reason))