Example #1
    def createComponents(self, fieldName):
        source = StandardTokenizer()
        filter1 = LowerCaseFilter(source)
        filter1 = PorterStemFilter(filter1)
        filter1 = StopFilter(filter1, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)

        return self.TokenStreamComponents(source, filter1)
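For context, createComponents here overrides PyLucene's PythonAnalyzer extension point. A minimal sketch of the enclosing class, assuming PyLucene's org.apache.pylucene.analysis package and package paths that may vary between Lucene versions; the class name is hypothetical:

    import lucene
    from org.apache.pylucene.analysis import PythonAnalyzer
    from org.apache.lucene.analysis.standard import StandardTokenizer
    from org.apache.lucene.analysis.core import LowerCaseFilter, StopFilter
    from org.apache.lucene.analysis.en import EnglishAnalyzer, PorterStemFilter

    lucene.initVM()  # the JVM must be running before any Lucene class is used

    class PorterStemAnalyzer(PythonAnalyzer):  # hypothetical class name

        def createComponents(self, fieldName):
            source = StandardTokenizer()
            stream = LowerCaseFilter(source)
            stream = PorterStemFilter(stream)
            stream = StopFilter(stream, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)
            return self.TokenStreamComponents(source, stream)

An instance of such an analyzer is then handed to IndexWriterConfig or a QueryParser in the usual way.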
Example #2
    def createComponents(self, fieldName):
        source = StandardTokenizer()
        stream = LowerCaseFilter(source)
        stream = StopFilter(stream, PortugueseAnalyzer.getDefaultStopSet())
        stream = PortugueseLightStemFilter(stream)

        return self.TokenStreamComponents(source, stream)
Example #3
    def testStopWords(self):

        try:
            result = StandardTokenizer()
            result.setReader(self.reader)
            result = StopFilter(result, self.stop_set)
        except Exception as e:
            self.fail(str(e))
Example #4
    def createComponents(self, fieldName):
        source = StandardTokenizer()
        result = EnglishPossessiveFilter(source)
        result = LowerCaseFilter(result)
        result = DiacriticFilter(result)
        result = StopFilter(result, self.stopwords)
        if not self.stemExclusionSet.isEmpty():
            result = SetKeywordMarkerFilter(result, self.stemExclusionSet)
        result = PorterStemFilter(result)
        return Analyzer.TokenStreamComponents(source, result)
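The SetKeywordMarkerFilter step above shields every term in stemExclusionSet from the Porter stemmer. One hedged way to build such a set: StopFilter.makeStopSet (used in the tests below) returns a plain CharArraySet, so it can serve as an exclusion set too; the terms here are illustrative:

    # illustrative: terms the stemmer must leave untouched
    self.stemExclusionSet = StopFilter.makeStopSet(["lucene", "pylucene"])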
Example #5
    def testStopList(self):

        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)

        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer")
        stream = newStop.tokenStream("test", reader)
        self.assertTrue(stream is not None)
        stream.reset()

        termAtt = stream.getAttribute(CharTermAttribute.class_)

        while stream.incrementToken():
            text = termAtt.toString()
            self.assertTrue(text not in stopWordsSet)
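As a hedged aside, the Lucene TokenStream contract also calls for end() and close() after the consume loop, which the tests in this listing abbreviate; continuing the test above, the full cycle would be:

    # full TokenStream lifecycle per the Lucene contract:
    # reset() -> incrementToken()* -> end() -> close()
    stream = newStop.tokenStream("test", StringReader("a good test"))
    termAtt = stream.addAttribute(CharTermAttribute.class_)
    stream.reset()
    tokens = []
    while stream.incrementToken():
        tokens.append(termAtt.toString())
    stream.end()
    stream.close()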
Example #6
    def testStopListPositions(self):

        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)

        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer with positions")
        # expected position increments for the surviving tokens; values > 1
        # mark gaps left by removed stop words
        expectedIncr = [ 1,   1, 1,          3, 1,  1,      1,            2,   1]
        stream = newStop.tokenStream("test", reader)
        self.assertTrue(stream is not None)
        stream.reset()

        i = 0
        termAtt = stream.getAttribute(CharTermAttribute.class_)
        posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)

        while stream.incrementToken():
            text = termAtt.toString()
            self.assertTrue(text not in stopWordsSet)
            self.assertEqual(expectedIncr[i],
                             posIncrAtt.getPositionIncrement())
            i += 1
Example #7
    def setUp(self):

        stopWords = ['the', 'and', 's']
        self.stop_set = StopFilter.makeStopSet(stopWords)
        self.reader = StringReader('foo')
Example #8
    def filter_stopwords(stream):
        # drop Lucene's built-in Romanian stop words first
        stream = StopFilter(stream, RomanianAnalyzer.getDefaultStopSet())
        # then drop project-specific stop words loaded from disk
        with open('romanian-stopwords.txt', encoding='utf-8') as f:
            additional_stopwords = f.read().split()
        additional_stopwords = StopFilter.makeStopSet(additional_stopwords)
        return StopFilter(stream, additional_stopwords)
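A hedged sketch of how such a helper could be wired into an analyzer; the class name is hypothetical, and only the filter_stopwords call comes from the example above:

    class RomanianStopAnalyzer(PythonAnalyzer):  # hypothetical name

        def createComponents(self, fieldName):
            source = StandardTokenizer()
            stream = LowerCaseFilter(source)
            # chain both the default and the file-based stop sets
            stream = filter_stopwords(stream)
            return self.TokenStreamComponents(source, stream)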