def OrNode(self, node):
    """Evaluate an OR query node.

    Each child node is evaluated recursively (the instance itself is the
    evaluator callable) and the per-child result sets are merged into a
    single union result set.
    """
    child_results = []
    for child in node.getValue():
        child_results.append(self(child))
    return unionResultSets(child_results)
def search(self, query, **kw):
    """ Perform a query against the index. Valid query options are:

        'parser'     -- named utility implementing IParser
        'language'   -- language to be used to lookup words from the lexicon
        'field'      -- perform searches against a configured index field
        'autoexpand' -- off|always|on_miss (see below)

        Returns a ResultSet instance (optionally ranked).
        Raises ValueError for any invalid query or query option.
    """
    # queries must be unicode
    if not isinstance(query, unicode):
        raise ValueError('Query must be unicode string')

    # First check query options (reject anything not explicitly supported)
    for k in kw:
        if k not in self.query_options:
            raise ValueError(
                'Unknown option: %s (supported query options: %s)' %
                (k, ', '.join(self.query_options)))

    # obtain parser ID (which is the name of named utility implementing IParser)
    parser_id = kw.get('parser', self.query_parser)

    # determine query language
    language = kw.get('language', self.languages[0])
    if language not in self.languages:
        raise ValueError(
            'Unsupported language: %s (supported languages: %s)' %
            (language, ', '.join(self.languages)))

    # check if field is known to the index
    field = kw.get('field')
    search_all_fields = kw.get('search_all_fields')
    if field and search_all_fields:
        raise ValueError('Cannot specify field and search_all_fields')

    if search_all_fields:
        if not self.dedicated_storage:
            raise ValueError(
                'search_all_fields cannot be used without dedicated '
                'storage.')
        search_fields = self.fields
    else:
        if not field:
            # default to the first configured field
            field = self.fields[0]
        if field not in self.fields:
            raise ValueError('Unknown field: %s (known fields: %s)' %
                             (field, ', '.join(self.fields)))
        search_fields = [field]

    # perform optional cosine ranking after searching
    ranking = bool(kw.get('ranking', self.ranking))
    if ranking and not self._feature_ranking:
        raise ValueError(
            "The storage used for this index does not support relevance ranking"
        )

    # Limit *ranked* result set to at most 'ranking_maxhits' hits
    ranking_maxhits = kw.get('ranking_maxhits', 50)
    if not isinstance(ranking_maxhits, int):
        raise ValueError('"ranking_maxhits" must be an integer')
    # 'in' instead of the deprecated dict.has_key() (works on Python 2 and 3)
    if 'ranking_maxhits' in kw and not ranking:
        raise ValueError(
            'Specify "ranking_maxhits" only with having set ranking=True')

    # autoexpansion of query terms
    # 'off'     -- expand never
    # 'always'  -- expand always
    # 'on_miss' -- expand only for not-found terms in the query string
    autoexpand = kw.get('autoexpand', self.autoexpand)
    if autoexpand not in ('off', 'always', 'on_miss'):
        raise ValueError(
            '"autoexpand" must either be "off", "always" or "on_miss"')

    # Use a sequence of configured thesauri (identified by their configured
    # name) for additional lookup of terms.  Accept unicode ids as well,
    # consistent with the unicode-only query contract above.
    thesaurus = kw.get('thesaurus', [])
    if isinstance(thesaurus, (str, unicode)):
        thesaurus = (thesaurus, )
    if not isinstance(thesaurus, (list, tuple)):
        raise ValueError(
            '"thesaurus" must be list or tuple of configured thesaurus ids'
        )

    # Similarity ratio (measured as Levenshtein distance)
    similarity_ratio = float(kw.get('similarity_ratio', 0.75))
    if similarity_ratio < 0.0 or similarity_ratio > 1.0:
        raise ValueError(
            'similarity_ratio must be between 0.0 and 1.0 (value %f)' %
            similarity_ratio)

    # obtain a parser (registered as named utility)
    parser = getUtility(IParser, parser_id)

    # run query string through normalizer, case normalizer etc.
    query = self._prepare_query(query, language)

    # create a tree of nodes
    parsed_query = parser.parse(query)
    if not parsed_query:
        raise ValueError('No query specified')

    # Post-filter for stopwords. We need to perform this
    # outside the query parser because the lex/yacc-based query
    # parser implementation can't be used in a reasonable way
    # to deal with such additional functionality.
    if self.use_stopwords:
        sw_utility = getUtility(IStopwords)
        stopwords = sw_utility.stopwordsForLanguage(language)
        if stopwords:
            # The stopword remover removes WordNodes representing
            # a stopword *in-place*
            stopword_remover(parsed_query, stopwords)

    # Split word nodes with the splitter
    splitter = createObject(
        self.splitter,
        casefolding=self.splitter_casefolding,
        separator=self.splitter_additional_chars,
        maxlen=self.splitter_max_length,
    )
    parsed_query = node_splitter(parsed_query, splitter)

    # build a search request per field and evaluate each one
    resultsets = []
    for field in search_fields:
        sr = SearchRequest(self,
                           query=query,
                           parsetree=parsed_query,
                           field=field,
                           autoexpand=autoexpand,
                           similarity_ratio=similarity_ratio,
                           thesaurus=thesaurus,
                           language=language)
        # call the evaluator and produce a ResultSet instance
        resultsets.append(Evaluator(sr).run())
    resultset = unionResultSets(resultsets)

    # optional ranking using the cosine measure or another configured
    # ranking method
    if ranking:
        ranking_method = getUtility(IRanking, name=self.ranking_method)
        resultset.ranking(ranking_method,
                          index=self,
                          language=language,
                          nbest=ranking_maxhits)

    return resultset
def search(self, query, **kw):
    """ Perform a query against the index. Valid query options are:

        'parser' -- named utility implementing IParser
        'language' -- language to be used to lookup words from the lexicon
        'field' -- perform searches against a configured index field
        'autoexpand' -- off|always|on_miss (see below)
    """
    # Only unicode query strings are accepted.
    if not isinstance(query, unicode):
        raise ValueError('Query must be unicode string')

    # Reject unknown query options up front.
    for option_name in kw:
        if option_name not in self.query_options:
            raise ValueError('Unknown option: %s (supported query options: %s)' % (option_name, ', '.join(self.query_options)))

    # Name of the named utility implementing IParser.
    parser_id = kw.get('parser', self.query_parser)

    # Language used to look up words from the lexicon.
    language = kw.get('language', self.languages[0])
    if language not in self.languages:
        raise ValueError('Unsupported language: %s (supported languages: %s)' % (language, ', '.join(self.languages)))

    # Resolve which index fields are searched.
    field = kw.get('field')
    search_all_fields = kw.get('search_all_fields')
    if field and search_all_fields:
        raise ValueError('Cannot specify field and search_all_fields')

    if search_all_fields:
        if not self.dedicated_storage:
            raise ValueError(
                'search_all_fields cannot be used without dedicated '
                'storage.')
        search_fields = self.fields
    else:
        field = field or self.fields[0]
        if field not in self.fields:
            raise ValueError('Unknown field: %s (known fields: %s)' % (
                field, ', '.join(self.fields)))
        search_fields = [field]

    # Optional cosine ranking performed after searching.
    ranking = bool(kw.get('ranking', self.ranking))
    if ranking and not self._feature_ranking:
        raise ValueError("The storage used for this index does not support relevance ranking")

    # Cap for the *ranked* result set.
    ranking_maxhits = kw.get('ranking_maxhits', 50)
    if not isinstance(ranking_maxhits, int):
        raise ValueError('"ranking_maxhits" must be an integer')
    if 'ranking_maxhits' in kw and not ranking:
        raise ValueError('Specify "ranking_maxhits" only with having set ranking=True')

    # Query-term autoexpansion mode:
    #   'off'     -- never expand
    #   'always'  -- always expand
    #   'on_miss' -- expand only terms not found in the query string
    autoexpand = kw.get('autoexpand', self.autoexpand)
    if autoexpand not in ('off', 'always', 'on_miss'):
        raise ValueError('"autoexpand" must either be "off", "always" or "on_miss"')

    # Sequence of configured thesauri (by configured name) consulted for
    # additional term lookup; a single id is normalized to a one-tuple.
    thesaurus = kw.get('thesaurus', [])
    if isinstance(thesaurus, str):
        thesaurus = (thesaurus,)
    if not isinstance(thesaurus, (list, tuple)):
        raise ValueError('"thesaurus" must be list or tuple of configured thesaurus ids')

    # Similarity ratio (measured as Levenshtein distance), clamped check only.
    similarity_ratio = float(kw.get('similarity_ratio', 0.75))
    if similarity_ratio < 0.0 or similarity_ratio > 1.0:
        raise ValueError('similarity_ratio must been 0.0 and 1.0 (value %f)' % similarity_ratio)

    # Look up the parser utility and normalize the query string.
    parser = getUtility(IParser, parser_id)
    query = self._prepare_query(query, language)

    # Parse the query string into a tree of nodes.
    parsed_query = parser.parse(query)
    if not parsed_query:
        raise ValueError('No query specified')

    # Stopword post-filtering happens outside the query parser because the
    # lex/yacc-based parser cannot reasonably host this functionality.
    if self.use_stopwords:
        stopword_utility = getUtility(IStopwords)
        stopwords = stopword_utility.stopwordsForLanguage(language)
        if stopwords:
            # WordNodes representing stopwords are removed *in-place*.
            stopword_remover(parsed_query, stopwords)

    # Split word nodes with the configured splitter.
    splitter = createObject(self.splitter,
                            casefolding=self.splitter_casefolding,
                            separator=self.splitter_additional_chars,
                            maxlen=self.splitter_max_length,
                            )
    parsed_query = node_splitter(parsed_query, splitter)

    # Evaluate one SearchRequest per field and union the ResultSets.
    partial_results = []
    for field in search_fields:
        request = SearchRequest(self,
                                query=query,
                                parsetree=parsed_query,
                                field=field,
                                autoexpand=autoexpand,
                                similarity_ratio=similarity_ratio,
                                thesaurus=thesaurus,
                                language=language)
        partial_results.append(Evaluator(request).run())
    resultset = unionResultSets(partial_results)

    # Optionally rank the merged result set with the configured method.
    if ranking:
        ranking_method = getUtility(IRanking, name=self.ranking_method)
        resultset.ranking(ranking_method,
                          index=self,
                          language=language,
                          nbest=ranking_maxhits)

    return resultset