def createComponents(self, fieldName):
    """Build the English analysis chain: tokenize, lowercase, stop, stem.

    Stop-word removal runs BEFORE Porter stemming. The original chain
    stemmed first, but the stemmer mutates tokens (Porter step 1a turns
    "this" into "thi"), so surface-form stop words could no longer match
    ENGLISH_STOP_WORDS_SET. Lucene's EnglishAnalyzer uses this order too.
    """
    source = StandardTokenizer()
    stream = LowerCaseFilter(source)
    # Filter stop words while tokens still have their surface forms.
    stream = StopFilter(stream, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)
    stream = PorterStemFilter(stream)
    return self.TokenStreamComponents(source, stream)
def createComponents(self, fieldName):
    """Assemble the Portuguese analysis chain for *fieldName*.

    Tokenize, lowercase, drop the default Portuguese stop words, then
    apply light stemming — the ordering PortugueseAnalyzer itself uses.
    """
    tokenizer = StandardTokenizer()
    chain = LowerCaseFilter(tokenizer)
    chain = StopFilter(chain, PortugueseAnalyzer.getDefaultStopSet())
    chain = PortugueseLightStemFilter(chain)
    return self.TokenStreamComponents(tokenizer, chain)
def testStopWords(self):
    """Building a stop-filtered stream over self.reader must not raise."""
    try:
        result = StandardTokenizer()
        result.setReader(self.reader)
        result = StopFilter(result, self.stop_set)
    except Exception as e:
        # `except ... as e` works on Python 2.6+ AND Python 3; the
        # original comma form (`except Exception, e`) is a SyntaxError
        # on Python 3.
        self.fail(str(e))
def createComponents(self, fieldName):
    """Build the English analysis chain for *fieldName*.

    Order mirrors Lucene's EnglishAnalyzer: possessive stripping,
    lowercasing, diacritic folding, stop-word removal, keyword marking
    for stem exclusions, then Porter stemming.
    """
    source = StandardTokenizer()
    result = EnglishPossessiveFilter(source)
    result = LowerCaseFilter(result)
    result = DiacriticFilter(result)
    result = StopFilter(result, self.stopwords)
    # Protect excluded terms from the stemmer. `not ...isEmpty()` is the
    # idiomatic form of the original `isEmpty() is False` comparison.
    if not self.stemExclusionSet.isEmpty():
        result = SetKeywordMarkerFilter(result, self.stemExclusionSet)
    result = PorterStemFilter(result)
    return Analyzer.TokenStreamComponents(source, result)
def testStopList(self):
    """An analyzer built from a custom stop set must emit none of them."""
    stopWords = ["good", "test", "analyzer"]
    stopWordsSet = StopFilter.makeStopSet(stopWords)
    newStop = StopAnalyzer(stopWordsSet)
    reader = StringReader("This is a good test of the english stop analyzer")
    stream = newStop.tokenStream("test", reader)
    self.assert_(stream is not None)
    stream.reset()
    termAtt = stream.getAttribute(CharTermAttribute.class_)
    try:
        while stream.incrementToken():
            text = termAtt.toString()
            self.assert_(text not in stopWordsSet)
        # TokenStream contract: signal end-of-stream, then release
        # resources. The original leaked the stream.
        stream.end()
    finally:
        stream.close()
def testStopListPositions(self):
    """Stop filtering must preserve position increments over the holes.

    Each removed stop word shows up as an enlarged position increment on
    the token that follows it; the expected increments are checked one
    by one, and the stream is properly end()ed and close()d afterwards
    (the original leaked it).
    """
    stopWords = ["good", "test", "analyzer"]
    stopWordsSet = StopFilter.makeStopSet(stopWords)
    newStop = StopAnalyzer(stopWordsSet)
    reader = StringReader(
        "This is a good test of the english stop analyzer with positions")
    expectedIncr = [1, 1, 1, 3, 1, 1, 1, 2, 1]
    stream = newStop.tokenStream("test", reader)
    self.assert_(stream is not None)
    stream.reset()
    i = 0
    termAtt = stream.getAttribute(CharTermAttribute.class_)
    posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)
    try:
        while stream.incrementToken():
            text = termAtt.toString()
            self.assert_(text not in stopWordsSet)
            self.assertEqual(expectedIncr[i],
                             posIncrAtt.getPositionIncrement())
            i += 1
        # Fail if the stream produced fewer tokens than expected instead
        # of silently passing on a short stream.
        self.assertEqual(len(expectedIncr), i)
        stream.end()
    finally:
        stream.close()
def setUp(self):
    """Create the shared fixtures: a small stop set and a dummy reader."""
    words = ['the', 'and', 's']
    self.stop_set = StopFilter.makeStopSet(words)
    self.reader = StringReader('foo')
def filter_stopwords(stream):
    """Wrap *stream* with Romanian default stop words plus extras from disk.

    The extra words are read (UTF-8) from 'romanian-stopwords.txt', one
    whitespace-separated token per entry, and applied as a second
    StopFilter layered on top of the default set.
    """
    filtered = StopFilter(stream, RomanianAnalyzer.getDefaultStopSet())
    with open('romanian-stopwords.txt') as f:
        extra_words = unicode(f.read(), 'utf-8').split()
    extra_set = StopFilter.makeStopSet(extra_words)
    return StopFilter(filtered, extra_set)