Example #1
    def _process_words(self, content, language):
        """ implements the processing pipeline """

        # first normalize content string
        if self.use_normalizer:
            normalizer = getUtility(INormalizer)
            content = normalizer.process(content, language)

        # now create a new splitter
        splitter = createObject(
            self.splitter,
            casefolding=self.splitter_casefolding,
            separator=self.splitter_additional_chars,
            maxlen=self.splitter_max_length,
        )

        # and split unicode content into list of unicode strings
        words = splitter.split(content)

        # now filter out all stopwords
        if self.use_stopwords:
            sw_utility = getUtility(IStopwords)
            words = sw_utility.process(words, language)

        # Stem words if required. If no stemmer for 'language' is available
        # then do not stem
        if self.use_stemmer:
            S = getStemmer(language)
            if S:
                words = S.stem(words)

        return words
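
To see the shape of this pipeline without the Zope component machinery, here is a
minimal standalone sketch. The lowercasing normalizer, regex splitter, stopword set
and suffix-stripping "stemmer" below are plain-Python stand-ins for illustration,
not the utilities zopyx.txng3.core actually registers.

import re

STOPWORDS = {'the', 'a', 'an', 'and', 'or'}  # illustrative stopword list

def process_words(content, use_stemmer=True):
    # normalize: casefold the whole string
    content = content.lower()
    # split: extract word tokens from the unicode content
    words = re.findall(r'\w+', content)
    # filter out stopwords
    words = [w for w in words if w not in STOPWORDS]
    # "stem": naive plural stripping, standing in for a real stemmer
    if use_stemmer:
        words = [w[:-1] if w.endswith('s') else w for w in words]
    return words

print(process_words('The cats and the dogs'))  # ['cat', 'dog']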
Example #2
import zope.component
from zopyx.txng3.core.interfaces import IThesaurus

def lookup_word(SR, word, field):
    index = SR.getIndex()
    lexicon = index.getLexicon()

    if index.use_stemmer:
        # Stemmer support only works with disabled autoexpansion
        S = getStemmer(SR.language)
        if S:
            word = S.stem([word])[0]

        wordid = lexicon.getWordId(word, SR.language)
        if SR.autoexpand != 'off':
            raise ValueError(
                'auto expansion is only available without enabled stemmer support'
            )
        _words, _wids = [word], [wordid]

    else:
        wordid = lexicon.getWordId(word, SR.language)

        # perform autoexpansion only if the length of the given term is
        # greater than or equal to the index's autoexpand_limit
        # configuration parameter

        if ((SR.autoexpand == 'always'
                or (SR.autoexpand == 'on_miss' and not wordid))
                and len(word) >= index.autoexpand_limit):
            # lookup all words with 'word' as prefix
            words = list(lexicon.getWordsForRightTruncation(word, SR.language))

            # obtain wordids for words
            wids = lexicon.getWordIds(words, SR.language)

            # add the original word and wordid
            wids.append(wordid)
            words.append(word)
            _words, _wids = words, wids
        else:
            _words, _wids = [word], [wordid]

    # Thesaurus handling: if SR.thesaurus is set to a list of configured
    # thesaurus IDs, look up related terms for every word and enrich the
    # result set

    if SR.thesaurus:
        for word in _words[:]:
            for id in SR.thesaurus:
                TH = zope.component.queryUtility(IThesaurus, id)
                if TH is None:
                    raise ValueError('No thesaurus "%s" configured' % id)

                related_terms = TH.getTermsFor(word)
                if related_terms:
                    _words.extend(related_terms)
                    wids = lexicon.getWordIds(related_terms, SR.language)
                    _wids.extend(wids)

    return ResultSet(
        index.getStorage(field).getDocumentsForWordIds(_wids),
        [(w, field) for w in _words])
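
The autoexpansion branch above is easiest to follow in isolation. The sketch below
mirrors just that decision, with the lexicon replaced by a plain {word: wordid}
dict; expand_term and its parameters are illustrative names, not part of the
library's API.

def expand_term(lexicon, word, wordid, autoexpand, autoexpand_limit):
    # expand only for 'always', or for 'on_miss' when the exact word is
    # unknown, and only if the term is long enough
    if ((autoexpand == 'always'
            or (autoexpand == 'on_miss' and not wordid))
            and len(word) >= autoexpand_limit):
        # all words with 'word' as prefix, like getWordsForRightTruncation()
        words = [w for w in lexicon if w.startswith(word)]
        wids = [lexicon[w] for w in words]
        # add the original word and wordid, as the real code does
        words.append(word)
        wids.append(wordid)
        return words, wids
    return [word], [wordid]

lex = {'search': 1, 'searching': 2, 'searched': 3}
print(expand_term(lex, 'sear', None, 'on_miss', 4))
# (['search', 'searching', 'searched', 'sear'], [1, 2, 3, None])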
Example #3
def lookup_by_phrase(SR, docids, words, field):
    index = SR.getIndex()
    lexicon = index.getLexicon()
    storage = index.getStorage(field)

    if index.use_stemmer:
        S = getStemmer(SR.language)
        if S:
            words = S.stem(words)

    wids = lexicon.getWordIds(words, SR.language)
    # keep only documents in which the query word ids occur contiguously
    docids = [
        docid for docid in docids if storage.hasContigousWordids(docid, wids)
    ]
    return ResultSet(DocidList(docids), [(w, field) for w in words])
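
hasContigousWordids is what turns a bag-of-words match into a phrase match: a
document qualifies only if the query word ids occur in it as one unbroken run.
Below is a minimal sketch of that check over a plain list of word ids (the real
storage keeps its own per-document word id sequences; this helper is illustrative).

def has_contiguous_wordids(doc_wids, query_wids):
    # True if query_wids occurs as a contiguous slice of doc_wids
    n = len(query_wids)
    return any(doc_wids[i:i + n] == query_wids
               for i in range(len(doc_wids) - n + 1))

doc = [7, 3, 9, 4, 3]
print(has_contiguous_wordids(doc, [9, 4]))  # True: 9 and 4 are adjacent
print(has_contiguous_wordids(doc, [3, 4]))  # False: 3 and 4 never appear adjacent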