Ejemplo n.º 1
0
 def OrNode(self, node):
     """Evaluate an OR node: evaluate every child node and return the
     union of the resulting result sets.
     """
     child_sets = []
     for child in node.getValue():
         child_sets.append(self(child))
     return unionResultSets(child_sets)
Ejemplo n.º 2
0
    def search(self, query, **kw):
        """ Perform a query against the index. Valid query options are:

            'parser' -- named utility implementing IParser
            'language' -- language to be used to lookup words from the lexicon
            'field' -- perform searches against a configured index field
            'autoexpand' -- off|always|on_miss (see below)

            Returns a ResultSet instance (ranked if ranking is enabled).
            Raises ValueError for any invalid query or query option.
        """

        # queries must be unicode
        if not isinstance(query, unicode):
            raise ValueError('Query must be unicode string')

        # First check query options
        for k in kw:
            if k not in self.query_options:
                raise ValueError(
                    'Unknown option: %s (supported query options: %s)' %
                    (k, ', '.join(self.query_options)))

        # obtain parser ID (which is the name of named utility implementing IParser)
        parser_id = kw.get('parser', self.query_parser)

        # determine query language (default: first configured language)
        language = kw.get('language', self.languages[0])
        if language not in self.languages:
            raise ValueError(
                'Unsupported language: %s (supported languages: %s)' %
                (language, ', '.join(self.languages)))

        # check if field is known to the index; 'field' and
        # 'search_all_fields' are mutually exclusive
        field = kw.get('field')
        search_all_fields = kw.get('search_all_fields')
        if field and search_all_fields:
            raise ValueError('Cannot specify field and search_all_fields')
        if search_all_fields:
            # searching all fields requires per-field (dedicated) storage
            if not self.dedicated_storage:
                raise ValueError(
                    'search_all_fields cannot be used without dedicated '
                    'storage.')
            search_fields = self.fields
        else:
            if not field:
                field = self.fields[0]
            if field not in self.fields:
                raise ValueError('Unknown field: %s (known fields: %s)' %
                                 (field, ', '.join(self.fields)))
            search_fields = [field]

        # perform optional cosine ranking after searching
        ranking = bool(kw.get('ranking', self.ranking))
        if ranking and not self._feature_ranking:
            raise ValueError(
                "The storage used for this index does not support relevance ranking"
            )

        # Limit *ranked* result set to at most XXX hits
        ranking_maxhits = kw.get('ranking_maxhits', 50)
        if not isinstance(ranking_maxhits, int):
            raise ValueError('"ranking_maxhits" must be an integer')
        # 'ranking_maxhits' only makes sense together with ranking=True
        if 'ranking_maxhits' in kw and not ranking:
            raise ValueError(
                'Specify "ranking_maxhits" only with having set ranking=True')

        # autoexpansion of query terms
        # 'off' -- expand never
        # 'always' -- expand always
        # 'on_miss' -- expand only for not-found terms in the query string
        autoexpand = kw.get('autoexpand', self.autoexpand)
        if autoexpand not in ('off', 'always', 'on_miss'):
            raise ValueError(
                '"autoexpand" must either be "off", "always" or "on_miss"')

        # Use a sequence of configured thesauri (identified by their configured name)
        # for additional lookup of terms
        thesaurus = kw.get('thesaurus', [])
        if isinstance(thesaurus, str):
            # accept a single thesaurus id as shorthand
            thesaurus = (thesaurus, )
        if not isinstance(thesaurus, (list, tuple)):
            raise ValueError(
                '"thesaurus" must be list or tuple of configured thesaurus ids'
            )

        # Similarity ratio (measured as Levenshtein distance), must lie in [0, 1]
        similarity_ratio = float(kw.get('similarity_ratio', 0.75))
        if similarity_ratio < 0.0 or similarity_ratio > 1.0:
            raise ValueError(
                'similarity_ratio must be between 0.0 and 1.0 (value %f)' %
                similarity_ratio)

        # obtain a parser (registered  as named utility)
        parser = getUtility(IParser, parser_id)

        # run query string through normalizer, case normalizer etc.
        query = self._prepare_query(query, language)

        # create a tree of nodes
        parsed_query = parser.parse(query)

        if not parsed_query:
            raise ValueError('No query specified')

        # Post-filter for stopwords. We need to perform this
        # outside the query parser because the lex/yacc-based query
        # parser implementation can't be used in a reasonable way
        # to deal with such additional functionality.

        if self.use_stopwords:
            sw_utility = getUtility(IStopwords)
            stopwords = sw_utility.stopwordsForLanguage(language)

            if stopwords:
                # The stopword remover removes WordNodes representing
                # a stopword *in-place*
                stopword_remover(parsed_query, stopwords)

        # Split word nodes with the splitter
        splitter = createObject(
            self.splitter,
            casefolding=self.splitter_casefolding,
            separator=self.splitter_additional_chars,
            maxlen=self.splitter_max_length,
        )
        parsed_query = node_splitter(parsed_query, splitter)

        # build a SearchRequest per field, evaluate each, and merge the
        # per-field result sets into one
        resultsets = []
        for field in search_fields:
            sr = SearchRequest(self,
                               query=query,
                               parsetree=parsed_query,
                               field=field,
                               autoexpand=autoexpand,
                               similarity_ratio=similarity_ratio,
                               thesaurus=thesaurus,
                               language=language)

            # call the evaluator and produce a ResultSet instance
            resultsets.append(Evaluator(sr).run())
        resultset = unionResultSets(resultsets)

        # optional ranking using the cosine measure or another configure
        # ranking method
        if ranking:
            ranking_method = getUtility(IRanking, name=self.ranking_method)
            resultset.ranking(ranking_method,
                              index=self,
                              language=language,
                              nbest=ranking_maxhits)

        return resultset
Ejemplo n.º 3
0
 def OrNode(self, node):
     """Return the union of the result sets obtained by evaluating
     each subnode of an OR node.
     """
     subnodes = node.getValue()
     return unionResultSets([self(subnode) for subnode in subnodes])
Ejemplo n.º 4
0
    def search(self, query, **kw):
        """ Perform a query against the index. Valid query options are:

            'parser' -- named utility implementing IParser
            'language' -- language to be used to lookup words from the lexicon
            'field' -- perform searches against a configured index field
            'autoexpand' -- off|always|on_miss (see below)

            Returns a ResultSet instance (ranked if ranking is enabled).
            Raises ValueError for any invalid query or query option.
        """

        # queries must be unicode
        if not isinstance(query, unicode):
            raise ValueError('Query must be unicode string')

        # First check query options
        for k in kw:
            if k not in self.query_options:
                raise ValueError('Unknown option: %s (supported query options: %s)' % (k, ', '.join(self.query_options)))

        # obtain parser ID (which is the name of named utility implementing IParser)
        parser_id = kw.get('parser', self.query_parser)

        # determine query language (default: first configured language)
        language = kw.get('language', self.languages[0])
        if language not in self.languages:
            raise ValueError('Unsupported language: %s (supported languages: %s)' % (language, ', '.join(self.languages)))

        # check if field is known to the index; 'field' and
        # 'search_all_fields' are mutually exclusive
        field = kw.get('field')
        search_all_fields = kw.get('search_all_fields')
        if field and search_all_fields:
            raise ValueError('Cannot specify field and search_all_fields')
        if search_all_fields:
            # searching all fields requires per-field (dedicated) storage
            if not self.dedicated_storage:
                raise ValueError(
                    'search_all_fields cannot be used without dedicated '
                    'storage.')
            search_fields = self.fields
        else:
            if not field:
                field = self.fields[0]
            if field not in self.fields:
                raise ValueError('Unknown field: %s (known fields: %s)' % (
                    field, ', '.join(self.fields)))
            search_fields = [field]

        # perform optional cosine ranking after searching
        ranking = bool(kw.get('ranking', self.ranking))
        if ranking and not self._feature_ranking:
            raise ValueError("The storage used for this index does not support relevance ranking")

        # Limit *ranked* result set to at most XXX hits
        ranking_maxhits = kw.get('ranking_maxhits', 50)
        if not isinstance(ranking_maxhits, int):
            raise ValueError('"ranking_maxhits" must be an integer')
        # 'ranking_maxhits' only makes sense together with ranking=True
        if 'ranking_maxhits' in kw and not ranking:
            raise ValueError('Specify "ranking_maxhits" only with having set ranking=True')

        # autoexpansion of query terms
        # 'off' -- expand never
        # 'always' -- expand always
        # 'on_miss' -- expand only for not-found terms in the query string
        autoexpand = kw.get('autoexpand', self.autoexpand)
        if autoexpand not in ('off', 'always', 'on_miss'):
            raise ValueError('"autoexpand" must either be "off", "always" or "on_miss"')

        # Use a sequence of configured thesauri (identified by their configured name)
        # for additional lookup of terms
        thesaurus = kw.get('thesaurus', [])
        if isinstance(thesaurus, str):
            # accept a single thesaurus id as shorthand
            thesaurus = (thesaurus,)
        if not isinstance(thesaurus, (list, tuple)):
            raise ValueError('"thesaurus" must be list or tuple of configured thesaurus ids')

        # Similarity ratio (measured as Levenshtein distance), must lie in [0, 1]
        similarity_ratio = float(kw.get('similarity_ratio', 0.75))
        if similarity_ratio < 0.0 or similarity_ratio > 1.0:
            raise ValueError('similarity_ratio must be between 0.0 and 1.0 (value %f)' % similarity_ratio)

        # obtain a parser (registered  as named utility)
        parser = getUtility(IParser, parser_id)

        # run query string through normalizer, case normalizer etc.
        query = self._prepare_query(query, language)

        # create a tree of nodes
        parsed_query = parser.parse(query)

        if not parsed_query:
            raise ValueError('No query specified')

        # Post-filter for stopwords. We need to perform this
        # outside the query parser because the lex/yacc-based query
        # parser implementation can't be used in a reasonable way
        # to deal with such additional functionality.

        if self.use_stopwords:
            sw_utility = getUtility(IStopwords)
            stopwords = sw_utility.stopwordsForLanguage(language)

            if stopwords:
                # The stopword remover removes WordNodes representing
                # a stopword *in-place*
                stopword_remover(parsed_query, stopwords)

        # Split word nodes with the splitter
        splitter = createObject(self.splitter,
                                casefolding=self.splitter_casefolding,
                                separator=self.splitter_additional_chars,
                                maxlen=self.splitter_max_length,
                               )
        parsed_query = node_splitter(parsed_query, splitter)

        # build a SearchRequest per field, evaluate each, and merge the
        # per-field result sets into one
        resultsets = []
        for field in search_fields:
            sr = SearchRequest(self,
                               query=query,
                               parsetree=parsed_query,
                               field=field,
                               autoexpand=autoexpand,
                               similarity_ratio=similarity_ratio,
                               thesaurus=thesaurus,
                               language=language)

            # call the evaluator and produce a ResultSet instance
            resultsets.append(Evaluator(sr).run())
        resultset = unionResultSets(resultsets)

        # optional ranking using the cosine measure or another configure
        # ranking method
        if ranking:
            ranking_method = getUtility(IRanking, name=self.ranking_method)
            resultset.ranking(ranking_method,
                              index=self,
                              language=language,
                              nbest=ranking_maxhits)

        return resultset