def testProcess(self):
    """Check ``Stopwords().process`` on a German sentence for 'de' and 'en'.

    For 'de' the test expects 'der', 'die', 'das' and 'und' to be missing
    from the result; for 'en' it expects the token list back unchanged.
    """
    stopwords = Stopwords()
    sentence = unicode('der die das mondauto foobar gehen gut und überhaupt', 'iso-8859-15')

    # Each call gets its own freshly split list, so neither call can see
    # a list that the other one was handed.
    filtered_de = stopwords.process(sentence.split(' '), 'de')
    expected_de = [u'mondauto', u'foobar', u'gehen', u'gut']
    expected_de.append(unicode('überhaupt', 'iso-8859-15'))
    self.assertEqual(filtered_de, expected_de)

    filtered_en = stopwords.process(sentence.split(' '), 'en')
    self.assertEqual(filtered_en, sentence.split(' '))
def testStopwords(self):
    """Check the element type of the 'en' and 'de' stopword collections.

    Every entry must be exactly of type ``unicode``; the unknown language
    code 'xx' must yield a collection of length 0.
    """
    registry = Stopwords()

    for lang in ('en', 'de'):
        for word in registry.stopwordsForLanguage(lang):
            self.assertEqual(type(word), unicode)

    unknown = registry.stopwordsForLanguage('xx')
    self.assertEqual(len(unknown), 0)
def testProcess(self):
    """Run a German sentence through ``process`` with 'de' and with 'en'.

    The 'de' run is expected to drop 'der', 'die', 'das' and 'und'; the
    'en' run is expected to return all tokens untouched.
    """
    codec = 'iso-8859-15'
    sw = Stopwords()
    text = unicode('der die das mondauto foobar gehen gut und überhaupt', codec)

    # Split again for every call instead of sharing one token list.
    german_result = sw.process(text.split(' '), 'de')
    self.assertEqual(
        german_result,
        [u'mondauto', u'foobar', u'gehen', u'gut', unicode('überhaupt', codec)],
    )

    english_result = sw.process(text.split(' '), 'en')
    all_tokens = text.split(' ')
    self.assertEqual(english_result, all_tokens)
def setUp(self):
    """Prepare the component registry used by the tests.

    Calls the ``setUp()`` function that is in scope (a bare name, not this
    method) and then registers the converter, splitter, parser, lexicon,
    storage, stopwords and normalizer utilities via ``provideUtility``.
    """
    # NOTE(review): presumably the zope.component testing helper - confirm.
    setUp()

    # Registration order is kept exactly as before; only the repetitive
    # named registrations are table-driven.
    for component, interface, utility_name in (
        (PDFConverter, IConverter, 'application/pdf'),
        (SplitterFactory, IFactory, 'txng.splitters.default'),
        (SimpleSplitterFactory, IFactory, 'txng.splitters.simple'),
    ):
        provideUtility(component, interface, name=utility_name)

    # Instantiated here (not in a table above) so it is still created after
    # the first three registrations have happened.
    provideUtility(EnglishParser(), IParser, name='txng.parsers.en')

    for component, interface, utility_name in (
        (LexiconFactory, IFactory, 'txng.lexicons.default'),
        (StorageFactory, IFactory, 'txng.storages.default'),
    ):
        provideUtility(component, interface, name=utility_name)

    # Unnamed utilities.
    provideUtility(Stopwords(), IStopwords)
    provideUtility(Normalizer(), INormalizer)
def testStopwordReader(self):
    """Smoke test: request the stopwords of every available language.

    Nothing is asserted about the returned values; the test only fails if
    one of the calls raises.
    """
    reader = Stopwords()
    for language_code in reader.availableLanguages():
        reader.stopwordsForLanguage(language_code)
def testAvailableLanguages(self):
    """Check which language codes ``availableLanguages()`` reports.

    'en' and 'fr' must be present in the returned collection and the
    made-up code 'xx' must be absent.
    """
    languages = Stopwords().availableLanguages()

    # Use the TestCase assertion methods rather than bare ``assert``
    # statements: bare asserts are removed when Python runs with -O, which
    # would turn this test into a no-op. assertTrue/assertFalse (instead of
    # assertIn/assertNotIn) keep the test usable on pre-2.7 unittest.
    self.assertTrue('en' in languages)
    self.assertTrue('fr' in languages)
    self.assertFalse('xx' in languages)