def get_stop_words(self, language, fail_safe=False):
    """
    Build the StopWord object matching ``language``.

    ``language`` is first translated through ``self.language_codes``;
    a value that is not a key of that mapping is used unchanged.
    The collection is then taken from ``self.LOADED_LANGUAGES_CACHE``
    or, on a miss, loaded with ``self._get_stop_words`` and cached.

    When loading raises a StopWordError the error is re-raised, unless
    ``fail_safe`` is true: in that case a StopWord wrapping an empty
    list is returned and nothing is stored in the cache.
    """
    try:
        language = self.language_codes[language]
    except KeyError:
        # Not a known code: keep the name exactly as it was given.
        pass

    words = self.LOADED_LANGUAGES_CACHE.get(language)
    if words is not None:
        return StopWord(language, words)

    try:
        words = self._get_stop_words(language)
        self.LOADED_LANGUAGES_CACHE[language] = words
    except StopWordError as error:
        if fail_safe:
            return StopWord(language, [])
        raise error

    return StopWord(language, words)
def test_sub(self):
    """Check ``-`` and ``-=`` with StopWord, list and string operands."""

    def fresh():
        # Every scenario starts from the same three-word 'bar' collection.
        return StopWord('bar', ['baz', 'qux', 'norf'])

    # Binary subtraction of another StopWord.
    difference = fresh() - self.sw
    self.assertEqual(sorted(difference), ['norf', 'qux'])
    self.assertEqual(difference.language, 'bar')

    # In-place subtraction of another StopWord.
    target = fresh()
    target -= self.sw
    self.assertEqual(sorted(target), ['norf', 'qux'])
    self.assertEqual(target.language, 'bar')

    # In-place subtraction of a plain list; only 'qux' is in both.
    target = fresh()
    target -= ['tic', 'tac', 'toc', 'qux']
    self.assertEqual(sorted(target), ['baz', 'norf'])

    # In-place subtraction of a single word given as a string.
    target = fresh()
    target -= 'baz'
    self.assertEqual(sorted(target), ['norf', 'qux'])

    if sys.version_info[0] == 2:
        # Python 2 only: same check with a decoded (unicode) word.
        target = fresh()
        target -= 'baz'.decode('utf-8')
        self.assertEqual(sorted(target), ['norf', 'qux'])

    # An unsupported operand raises TypeError and leaves the words as is.
    target = fresh()
    with self.assertRaises(TypeError):
        target.__sub__(object())
    self.assertEqual(sorted(target), ['baz', 'norf', 'qux'])
def cmdline(argv=None):
    """
    Script for merging different collections of stop words.

    ``argv`` is the list of command-line arguments, program name
    excluded. When it is omitted or None, ``sys.argv[1:]`` is read at
    call time.

    The collection for the requested language is fetched from a
    StopWordFactory in fail-safe mode, each source file is read and
    merged into it with ``+=``, and the merged collection is written
    to the filename the factory returns for the resulting language.
    """
    if argv is None:
        # Resolved here rather than in the signature: a default of
        # ``sys.argv[1:]`` is evaluated once, when the function is
        # defined, and would ignore later changes to ``sys.argv``.
        argv = sys.argv[1:]

    parser = ArgumentParser(
        description='Create and merge collections of stop words')
    parser.add_argument(
        'language',
        help='The language used in the collection')
    parser.add_argument(
        'sources', metavar='FILE', nargs='+',
        help='Source files to parse')
    options = parser.parse_args(argv)

    factory = StopWordFactory()
    language = options.language
    stop_words = factory.get_stop_words(language, fail_safe=True)

    for source in options.sources:
        stop_words += StopWord(language, factory.read_collection(source))

    # The target file is derived from the language carried by the
    # merged StopWord object, not from the raw command-line value.
    target = factory.get_collection_filename(stop_words.language)
    factory.write_collection(target, stop_words.collection)
def check_stop_word_rebase(self, inpout, outpout, sept, char=None):
    """
    Assert that a 'test' StopWord built from ``sept`` rebases
    ``inpout`` into ``outpout``.

    ``char`` is forwarded as the second argument of ``rebase`` only
    when it is not None; otherwise ``rebase`` is called with
    ``inpout`` alone.
    """
    stop_word = StopWord('test', sept)
    rebase_args = (inpout,) if char is None else (inpout, char)
    self.assertEqual(stop_word.rebase(*rebase_args), outpout)
def setUp(self):
    """Create the ``self.sw`` fixture: a 'foo' StopWord of three words."""
    fixture_words = ['foo', 'bar', 'baz']
    self.sw = StopWord('foo', fixture_words)