def _process_words(self, content, language):
    """ implements the processing pipeline """

    # first normalize content string
    if self.use_normalizer:
        normalizer = getUtility(INormalizer)
        content = normalizer.process(content, language)

    # now create a new splitter
    splitter = createObject(
        self.splitter,
        casefolding=self.splitter_casefolding,
        separator=self.splitter_additional_chars,
        maxlen=self.splitter_max_length,
    )

    # and split unicode content into a list of unicode strings
    words = splitter.split(content)

    # now filter out all stopwords
    if self.use_stopwords:
        sw_utility = getUtility(IStopwords)
        words = sw_utility.process(words, language)

    # stem words if required; if no stemmer for 'language' is available,
    # do not stem
    if self.use_stemmer:
        S = getStemmer(language)
        if S:
            words = S.stem(words)

    return words

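# A minimal, self-contained sketch of the same four-stage pipeline
# (normalize -> split -> stopword filter -> stem), using plain stand-in
# functions instead of the registered zope.component utilities. Every name
# and the stopword list below are illustrative assumptions, not part of the
# real index API.
def _sketch_process_words(content, language='en'):
    # stand-in for INormalizer.process: lowercase only
    content = content.lower()

    # stand-in for the configured splitter: whitespace split
    words = content.split()

    # stand-in for the IStopwords utility, with a hypothetical stopword list
    stopwords = {'the', 'a', 'of'}
    words = [w for w in words if w not in stopwords]

    # stand-in for getStemmer(language).stem(words): naive plural stripping
    words = [w[:-1] if w.endswith('s') else w for w in words]

    return words

# Example: _sketch_process_words('The Catalogs of a Library')
# returns ['catalog', 'library']
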
def lookup_word(SR, word, field):
    index = SR.getIndex()
    lexicon = index.getLexicon()

    if index.use_stemmer:
        # stemmer support only works with autoexpansion disabled
        S = getStemmer(SR.language)
        if S:
            word = S.stem([word])[0]
        wordid = lexicon.getWordId(word, SR.language)
        if SR.autoexpand != 'off':
            raise ValueError(
                'autoexpansion is not available with stemmer support enabled')
        _words, _wids = [word], [wordid]

    else:
        wordid = lexicon.getWordId(word, SR.language)

        # perform autoexpansion only if the length of the given term is
        # greater than or equal to the autoexpand_limit configuration
        # parameter of the index
        if ((SR.autoexpand == 'always'
                or (SR.autoexpand == 'on_miss' and not wordid))
                and len(word) >= index.autoexpand_limit):
            # look up all words with 'word' as prefix
            words = list(lexicon.getWordsForRightTruncation(word, SR.language))
            # obtain wordids for these words
            wids = lexicon.getWordIds(words, SR.language)
            # add the original word and wordid
            wids.append(wordid)
            words.append(word)
            _words, _wids = words, wids
        else:
            _words, _wids = [word], [wordid]

    # Thesaurus handling: if SR.thesaurus is set to a list of configured
    # thesauruses, perform a lookup for every word and enrich the resultset
    if SR.thesaurus:
        import zope.component
        from zopyx.txng3.core.interfaces import IThesaurus

        for word in _words[:]:
            for thesaurus_id in SR.thesaurus:
                TH = zope.component.queryUtility(IThesaurus, thesaurus_id)
                if TH is None:
                    raise ValueError(
                        'No thesaurus "%s" configured' % thesaurus_id)
                related_terms = TH.getTermsFor(word)
                if related_terms:
                    _words.extend(related_terms)
                    wids = lexicon.getWordIds(related_terms, SR.language)
                    _wids.extend(wids)

    return ResultSet(
        index.getStorage(field).getDocumentsForWordIds(_wids),
        [(w, field) for w in _words])

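# Sketch of the autoexpansion branch above over a toy in-memory lexicon:
# getWordsForRightTruncation returns every known word that has the query
# term as a prefix. The function name, lexicon data and defaults here are
# hypothetical, chosen only to illustrate the 'always'/'on_miss' logic.
def _sketch_autoexpand(word, lexicon_words, autoexpand='on_miss',
                       autoexpand_limit=4):
    known = word in lexicon_words
    if ((autoexpand == 'always' or (autoexpand == 'on_miss' and not known))
            and len(word) >= autoexpand_limit):
        # right truncation: all words with 'word' as prefix, plus the
        # original word itself
        return sorted(w for w in lexicon_words if w.startswith(word)) + [word]
    return [word]

# Example with a hypothetical lexicon:
#   _sketch_autoexpand('index', {'indexer', 'indexing', 'lexicon'})
#   returns ['indexer', 'indexing', 'index'], since 'index' itself misses
#   and is long enough to trigger expansion
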
def lookup_by_phrase(SR, docids, words, field):
    index = SR.getIndex()
    lexicon = index.getLexicon()
    storage = index.getStorage(field)

    if index.use_stemmer:
        S = getStemmer(SR.language)
        if S:
            words = S.stem(words)

    wids = lexicon.getWordIds(words, SR.language)
    # keep only documents in which the word ids occur as a contiguous
    # sequence, i.e. as the phrase
    docids = [docid for docid in docids
              if storage.hasContigousWordids(docid, wids)]
    return ResultSet(DocidList(docids), [(w, field) for w in words])

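# Sketch of the contiguity test that storage.hasContigousWordids performs,
# over a toy representation of a document as an ordered list of word ids.
# This is an assumed reimplementation for illustration, not the storage's
# actual code.
def _sketch_has_contiguous_wordids(doc_wids, wids):
    # the phrase matches if its word ids occur in the document in exactly
    # this order, with no other word ids in between
    n = len(wids)
    return any(doc_wids[i:i + n] == wids
               for i in range(len(doc_wids) - n + 1))

# Example with hypothetical word ids: the phrase [7, 3] matches a document
# where 7 is immediately followed by 3, but not one with the order reversed:
#   _sketch_has_contiguous_wordids([1, 7, 3, 9], [7, 3])  -> True
#   _sketch_has_contiguous_wordids([3, 7, 1, 9], [7, 3])  -> False
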