Ejemplo n.º 1
0
    def get_stop_words(self, language, fail_safe=False):
        """
        Build the StopWord object for ``language``.

        ``language`` is first looked up in ``self.language_codes``; when it
        has no entry there it is used unchanged. The collection is taken from
        ``self.LOADED_LANGUAGES_CACHE`` when present, otherwise it is loaded
        with ``self._get_stop_words`` and stored in that cache.

        If loading raises StopWordError the error propagates, unless
        ``fail_safe`` is true, in which case a StopWord with an empty
        collection is returned (nothing is cached for that language).
        """
        try:
            resolved = self.language_codes[language]
        except KeyError:
            # No mapping for this key: keep the caller's value as it is.
            resolved = language

        words = self.LOADED_LANGUAGES_CACHE.get(resolved)
        if words is not None:
            # Cache hit: no loading needed.
            return StopWord(resolved, words)

        try:
            words = self._get_stop_words(resolved)
            self.LOADED_LANGUAGES_CACHE[resolved] = words
        except StopWordError:
            if not fail_safe:
                raise
            # Fail-safe mode: fall back to an empty collection.
            words = []

        return StopWord(resolved, words)
Ejemplo n.º 2
0
    def test_sub(self):
        """
        Check ``-`` and ``-=`` on StopWord with each accepted operand kind,
        and that an unsupported operand raises TypeError.
        """
        def fresh():
            # New instance backed by a new list for every scenario, so that
            # no scenario can observe another one's modifications.
            return StopWord('bar', ['baz', 'qux', 'norf'])

        # Binary subtraction of another StopWord keeps the left language.
        result = fresh() - self.sw
        self.assertEqual(sorted(result), ['norf', 'qux'])
        self.assertEqual(result.language, 'bar')

        # In-place subtraction of another StopWord.
        result = fresh()
        result -= self.sw
        self.assertEqual(sorted(result), ['norf', 'qux'])
        self.assertEqual(result.language, 'bar')

        # In-place subtraction of a plain list of words.
        result = fresh()
        result -= ['tic', 'tac', 'toc', 'qux']
        self.assertEqual(sorted(result), ['baz', 'norf'])

        # In-place subtraction of a single word given as a string.
        result = fresh()
        result -= 'baz'
        self.assertEqual(sorted(result), ['norf', 'qux'])

        # On Python 2 also exercise a ``unicode`` operand.
        if sys.version_info[0] == 2:
            result = fresh()
            result -= 'baz'.decode('utf-8')
            self.assertEqual(sorted(result), ['norf', 'qux'])

        # An unsupported operand raises and leaves the words untouched.
        untouched = fresh()
        self.assertRaises(TypeError, untouched.__sub__, object())
        self.assertEqual(sorted(untouched), ['baz', 'norf', 'qux'])
Ejemplo n.º 3
0
def cmdline(argv=None):
    """
    Script for merging different collections of stop words.

    ``argv`` is the list of command-line arguments to parse, without the
    program name. When it is None (the default) argparse reads
    ``sys.argv[1:]`` at call time; the previous ``argv=sys.argv[1:]``
    default was evaluated only once, when the function was defined.

    The collection for the requested language is obtained in fail-safe
    mode, every source file is merged into it, and the result is written to
    the file named by the factory for that language.
    """
    parser = ArgumentParser(
        description='Create and merge collections of stop words')
    parser.add_argument('language', help='The language used in the collection')
    parser.add_argument('sources',
                        metavar='FILE',
                        nargs='+',
                        help='Source files to parse')
    options = parser.parse_args(argv)

    factory = StopWordFactory()
    language = options.language
    # fail_safe=True so that an unavailable language does not abort the merge.
    stop_words = factory.get_stop_words(language, fail_safe=True)

    for source in options.sources:
        stop_words += StopWord(language, factory.read_collection(source))

    target = factory.get_collection_filename(stop_words.language)
    factory.write_collection(target, stop_words.collection)
Ejemplo n.º 4
0
 def check_stop_word_rebase(self, inpout, outpout, sept, char=None):
     """
     Assert that ``rebase`` on a StopWord built from ``sept`` maps
     ``inpout`` to ``outpout``.

     ``char`` is forwarded as the second positional argument of ``rebase``
     only when it is not None; otherwise ``rebase`` is called with
     ``inpout`` alone.
     """
     stop_word = StopWord('test', sept)
     extra_args = () if char is None else (char,)
     self.assertEqual(stop_word.rebase(inpout, *extra_args), outpout)
Ejemplo n.º 5
0
 def setUp(self):
     """Store a StopWord for language 'foo' on ``self.sw`` before each test."""
     fixture_words = ['foo', 'bar', 'baz']
     self.sw = StopWord('foo', fixture_words)