Ejemplo n.º 1
0
    def get_stop_words(self, language, fail_safe=False):
        """
        Build a StopWord object for ``language``.

        ``language`` is first replaced by its entry in
        ``self.language_codes`` when one exists. The collection is then
        taken from ``LOADED_LANGUAGES_CACHE`` or, on a miss, loaded through
        ``_get_stop_words`` and stored in the cache.

        If loading raises StopWordError the error propagates, unless
        ``fail_safe`` is True, in which case a StopWord object with an empty
        collection is returned (the empty collection is not cached).
        """
        try:
            language = self.language_codes[language]
        except KeyError:
            # No entry for this value: use it unchanged.
            pass

        words = self.LOADED_LANGUAGES_CACHE.get(language)
        if words is not None:
            # Cache hit: nothing to load.
            return StopWord(language, words)

        try:
            words = self._get_stop_words(language)
            self.LOADED_LANGUAGES_CACHE[language] = words
        except StopWordError:
            if fail_safe:
                return StopWord(language, [])
            raise

        return StopWord(language, words)
Ejemplo n.º 2
0
    def test_sub(self):
        """Check ``-`` and ``-=`` on a StopWord for each kind of operand."""
        def fresh():
            # Every scenario starts from the same three-word 'bar' object.
            return StopWord('bar', ['baz', 'qux', 'norf'])

        # Binary subtraction of another StopWord; the language stays 'bar'.
        nsw = fresh()
        nsw = nsw - self.sw
        self.assertEqual(sorted(nsw), ['norf', 'qux'])
        self.assertEqual(nsw.language, 'bar')

        # In-place subtraction of another StopWord.
        nsw = fresh()
        nsw -= self.sw
        self.assertEqual(sorted(nsw), ['norf', 'qux'])
        self.assertEqual(nsw.language, 'bar')

        # In-place subtraction of a list, including words that are absent.
        nsw = fresh()
        nsw -= ['tic', 'tac', 'toc', 'qux']
        self.assertEqual(sorted(nsw), ['baz', 'norf'])

        # In-place subtraction of a single word passed as a string.
        nsw = fresh()
        nsw -= 'baz'
        self.assertEqual(sorted(nsw), ['norf', 'qux'])

        if sys.version_info[0] == 2:
            # Python 2 only: the same word passed as a unicode object.
            nsw = fresh()
            nsw -= 'baz'.decode('utf-8')
            self.assertEqual(sorted(nsw), ['norf', 'qux'])

        # An arbitrary object is rejected and the words are left untouched.
        nsw = fresh()
        with self.assertRaises(TypeError):
            nsw.__sub__(object())
        self.assertEqual(sorted(nsw), ['baz', 'norf', 'qux'])
Ejemplo n.º 3
0
def cmdline(argv=None):
    """
    Script for merging different collections of stop words.

    The stop words for the given language are requested from a
    ``StopWordFactory`` with ``fail_safe=True``, the collection read from
    every source file is added to them, and the result is written to the
    file name the factory reports for the resulting language.

    :param argv: command line arguments without the program name. When
        ``None`` (the default) argparse reads ``sys.argv[1:]`` at call
        time, instead of a copy captured when this module was imported.
    """
    parser = ArgumentParser(
        description='Create and merge collections of stop words')
    parser.add_argument('language', help='The language used in the collection')
    parser.add_argument('sources',
                        metavar='FILE',
                        nargs='+',
                        help='Source files to parse')
    options = parser.parse_args(argv)

    factory = StopWordFactory()
    language = options.language
    stop_words = factory.get_stop_words(language, fail_safe=True)

    for source in options.sources:
        stop_words += StopWord(language, factory.read_collection(source))

    # Use the language carried by the merged object, not the raw argument,
    # when asking the factory where to store the collection.
    target = factory.get_collection_filename(stop_words.language)
    factory.write_collection(target, stop_words.collection)
Ejemplo n.º 4
0
 def check_stop_word_rebase(self, inpout, outpout, sept, char=None):
     """
     Assert that rebasing ``inpout`` yields ``outpout``.

     A StopWord named 'test' is built from ``sept``; ``char`` is forwarded
     to ``rebase`` as a second argument only when it is not None.
     """
     optional = () if char is None else (char,)
     stop_word = StopWord('test', sept)
     self.assertEqual(stop_word.rebase(inpout, *optional), outpout)
Ejemplo n.º 5
0
 def setUp(self):
     """Store a 'foo' StopWord holding three words on ``self.sw``."""
     words = ['foo', 'bar', 'baz']
     self.sw = StopWord('foo', words)
Ejemplo n.º 6
0
class StopWordTestCase(TestCase):
    """Tests for the container protocol and operators of StopWord."""

    def setUp(self):
        # Shared three-word object used as the right-hand operand.
        self.sw = StopWord('foo', ['foo', 'bar', 'baz'])

    def _bar(self):
        # Fresh left-hand operand so each scenario starts from the same state.
        return StopWord('bar', ['baz', 'qux', 'norf'])

    def test_len(self):
        size = len(self.sw)
        self.assertEqual(size, 3)

    def test_contains(self):
        self.assertIn('foo', self.sw)
        self.assertNotIn('qux', self.sw)

    def test_iter(self):
        items = list(self.sw)
        self.assertEqual(len(items), 3)

    def test_add(self):
        merged_words = ['bar', 'baz', 'foo', 'norf', 'qux']

        # Binary addition of another StopWord; the language stays 'bar'.
        nsw = self._bar()
        nsw = nsw + self.sw
        self.assertEqual(sorted(nsw), merged_words)
        self.assertEqual(nsw.language, 'bar')

        # In-place addition of another StopWord.
        nsw = self._bar()
        nsw += self.sw
        self.assertEqual(sorted(nsw), merged_words)
        self.assertEqual(nsw.language, 'bar')

        # In-place addition of a list of words.
        nsw = self._bar()
        nsw += ['tic', 'tac', 'toc']
        self.assertEqual(sorted(nsw),
                         ['baz', 'norf', 'qux', 'tac', 'tic', 'toc'])

        # In-place addition of a single word passed as a string.
        nsw = self._bar()
        nsw += 'tic'
        self.assertEqual(sorted(nsw), ['baz', 'norf', 'qux', 'tic'])

        if sys.version_info[0] == 2:
            # Python 2 only: the same word passed as a unicode object.
            nsw = self._bar()
            nsw += 'tic'.decode('utf-8')
            self.assertEqual(sorted(nsw), ['baz', 'norf', 'qux', 'tic'])

        # An arbitrary object is rejected and the words are left untouched.
        nsw = self._bar()
        self.assertRaises(TypeError, nsw.__add__, object())
        self.assertEqual(sorted(nsw), ['baz', 'norf', 'qux'])

    def test_sub(self):
        remaining = ['norf', 'qux']

        # Binary subtraction of another StopWord; the language stays 'bar'.
        nsw = self._bar()
        nsw = nsw - self.sw
        self.assertEqual(sorted(nsw), remaining)
        self.assertEqual(nsw.language, 'bar')

        # In-place subtraction of another StopWord.
        nsw = self._bar()
        nsw -= self.sw
        self.assertEqual(sorted(nsw), remaining)
        self.assertEqual(nsw.language, 'bar')

        # In-place subtraction of a list, including words that are absent.
        nsw = self._bar()
        nsw -= ['tic', 'tac', 'toc', 'qux']
        self.assertEqual(sorted(nsw), ['baz', 'norf'])

        # In-place subtraction of a single word passed as a string.
        nsw = self._bar()
        nsw -= 'baz'
        self.assertEqual(sorted(nsw), remaining)

        if sys.version_info[0] == 2:
            # Python 2 only: the same word passed as a unicode object.
            nsw = self._bar()
            nsw -= 'baz'.decode('utf-8')
            self.assertEqual(sorted(nsw), remaining)

        # An arbitrary object is rejected and the words are left untouched.
        nsw = self._bar()
        self.assertRaises(TypeError, nsw.__sub__, object())
        self.assertEqual(sorted(nsw), ['baz', 'norf', 'qux'])

    def test_str(self):
        text = self.sw.__str__()
        self.assertEqual(text, 'Foo stop words: 3 words')

    def test_repr(self):
        text = self.sw.__repr__()
        self.assertEqual(text, "Foo stop words: ['bar', 'baz', 'foo']")
Ejemplo n.º 7
0
 def check_stop_word_rebase(self, inpout, outpout, sept, char=None):
     """
     Rebase ``inpout`` with a StopWord built from ``sept`` and compare the
     result with ``outpout``.

     ``char`` is passed to ``rebase`` only when the caller supplied one.
     """
     stop_word = StopWord("test", sept)
     if char is not None:
         rebased = stop_word.rebase(inpout, char)
     else:
         rebased = stop_word.rebase(inpout)
     self.assertEqual(rebased, outpout)
Ejemplo n.º 8
0
 def setUp(self):
     """Create the StopWord under test: language "foo" with three words."""
     language = "foo"
     self.sw = StopWord(language, ["foo", "bar", "baz"])
Ejemplo n.º 9
0
class StopWordTestCase(TestCase):
    """Unit tests covering len, ``in``, iteration, ``+``/``-`` and the
    string forms of a StopWord object."""

    def setUp(self):
        # Rebuilt before every test method by the unittest runner.
        self.sw = StopWord("foo", ["foo", "bar", "baz"])

    def test_len(self):
        self.assertEqual(len(self.sw), 3)

    def test_contains(self):
        has_foo = "foo" in self.sw
        has_qux = "qux" in self.sw
        self.assertTrue(has_foo)
        self.assertFalse(has_qux)

    def test_iter(self):
        collected = [word for word in self.sw]
        self.assertEqual(len(collected), 3)

    def test_add(self):
        base = ["baz", "qux", "norf"]

        # ``+`` with a StopWord operand.
        left = StopWord("bar", list(base))
        left = left + self.sw
        self.assertEqual(sorted(left), ["bar", "baz", "foo", "norf", "qux"])
        self.assertEqual(left.language, "bar")

        # ``+=`` with a StopWord operand.
        left = StopWord("bar", list(base))
        left += self.sw
        self.assertEqual(sorted(left), ["bar", "baz", "foo", "norf", "qux"])
        self.assertEqual(left.language, "bar")

        # ``+=`` with a list operand.
        left = StopWord("bar", list(base))
        left += ["tic", "tac", "toc"]
        self.assertEqual(sorted(left), ["baz", "norf", "qux", "tac", "tic", "toc"])

        # ``+=`` with a string operand.
        left = StopWord("bar", list(base))
        left += "tic"
        self.assertEqual(sorted(left), ["baz", "norf", "qux", "tic"])

        if sys.version_info[0] == 2:
            # ``+=`` with a unicode operand (Python 2 only).
            left = StopWord("bar", list(base))
            left += "tic".decode("utf-8")
            self.assertEqual(sorted(left), ["baz", "norf", "qux", "tic"])

        # Unsupported operand: TypeError, and the words stay as they were.
        left = StopWord("bar", list(base))
        with self.assertRaises(TypeError):
            left.__add__(object())
        self.assertEqual(sorted(left), ["baz", "norf", "qux"])

    def test_sub(self):
        base = ["baz", "qux", "norf"]

        # ``-`` with a StopWord operand.
        left = StopWord("bar", list(base))
        left = left - self.sw
        self.assertEqual(sorted(left), ["norf", "qux"])
        self.assertEqual(left.language, "bar")

        # ``-=`` with a StopWord operand.
        left = StopWord("bar", list(base))
        left -= self.sw
        self.assertEqual(sorted(left), ["norf", "qux"])
        self.assertEqual(left.language, "bar")

        # ``-=`` with a list operand that also names absent words.
        left = StopWord("bar", list(base))
        left -= ["tic", "tac", "toc", "qux"]
        self.assertEqual(sorted(left), ["baz", "norf"])

        # ``-=`` with a string operand.
        left = StopWord("bar", list(base))
        left -= "baz"
        self.assertEqual(sorted(left), ["norf", "qux"])

        if sys.version_info[0] == 2:
            # ``-=`` with a unicode operand (Python 2 only).
            left = StopWord("bar", list(base))
            left -= "baz".decode("utf-8")
            self.assertEqual(sorted(left), ["norf", "qux"])

        # Unsupported operand: TypeError, and the words stay as they were.
        left = StopWord("bar", list(base))
        with self.assertRaises(TypeError):
            left.__sub__(object())
        self.assertEqual(sorted(left), ["baz", "norf", "qux"])

    def test_str(self):
        expected = "Foo stop words: 3 words"
        self.assertEqual(self.sw.__str__(), expected)

    def test_repr(self):
        expected = "Foo stop words: ['bar', 'baz', 'foo']"
        self.assertEqual(self.sw.__repr__(), expected)