def test_stop_word_rebase_functional(self):
        # Functional check: rebasing corpus/french.txt with the "fr" stop
        # words must reproduce corpus/french_solution.txt exactly.
        current_dir = os.path.dirname(__file__)
        file_name = os.path.join(current_dir, "corpus", "french.txt")
        # Context managers guarantee the handles are closed even when
        # reading or decoding raises.
        with open(file_name, "rb") as source_file:
            file_content = source_file.read().decode("utf-8")
        solution_name = os.path.join(current_dir, "corpus", "french_solution.txt")
        with open(solution_name, "rb") as solution_file:
            solution_content = solution_file.read().decode("utf-8")

        factory = StopWordFactory()
        stop_words = factory.get_stop_words("fr")
        file_content_rebased = stop_words.rebase(file_content)
        self.assertEqual(file_content_rebased, solution_content)
    def test_stop_word_rebase_functional(self):
        # Functional check: rebasing corpus/french.txt with the 'fr' stop
        # words must reproduce corpus/french_solution.txt exactly.
        current_dir = os.path.dirname(__file__)
        file_name = os.path.join(current_dir, 'corpus', 'french.txt')
        # Context managers guarantee the handles are closed even when
        # reading or decoding raises.
        with open(file_name, 'rb') as source_file:
            file_content = source_file.read().decode('utf-8')
        solution_name = os.path.join(current_dir, 'corpus',
                                     'french_solution.txt')
        with open(solution_name, 'rb') as solution_file:
            solution_content = solution_file.read().decode('utf-8')

        factory = StopWordFactory()
        stop_words = factory.get_stop_words('fr')
        file_content_rebased = stop_words.rebase(file_content)
        self.assertEqual(file_content_rebased, solution_content)
Beispiel #3
0
def cmdline(argv=sys.argv[1:]):
    """
    Script for rebasing a text file.

    Parses ``argv`` for two positional arguments (``language`` and
    ``source``), reads the source file as UTF-8 and prints the result
    of calling ``rebase`` on the stop words of that language.

    :param argv: command-line arguments without the program name.
        The default is evaluated once, when the function is defined.
    """
    parser = ArgumentParser(description='Rebase a text from his stop words')
    parser.add_argument('language', help='The language used to rebase')
    parser.add_argument('source', help='Text file to rebase')
    options = parser.parse_args(argv)

    factory = StopWordFactory()
    language = options.language
    # NOTE(review): fail_safe=True presumably keeps an unknown language
    # from aborting the script -- confirm against StopWordFactory.
    stop_words = factory.get_stop_words(language, fail_safe=True)
    # Read through a context manager so the handle is always closed.
    with open(options.source, 'rb') as source_file:
        content = source_file.read().decode('utf-8')
    print(stop_words.rebase(content))
def cmdline(argv=sys.argv[1:]):
    """
    Script for rebasing a text file.

    Parses ``argv`` for two positional arguments (``language`` and
    ``source``), reads the source file as UTF-8 and prints the result
    of calling ``rebase`` on the stop words of that language.

    :param argv: command-line arguments without the program name.
        The default is evaluated once, when the function is defined.
    """
    parser = ArgumentParser(
        description='Rebase a text from his stop words')
    parser.add_argument('language', help='The language used to rebase')
    parser.add_argument('source', help='Text file to rebase')
    options = parser.parse_args(argv)

    factory = StopWordFactory()
    language = options.language
    # NOTE(review): fail_safe=True presumably keeps an unknown language
    # from aborting the script -- confirm against StopWordFactory.
    stop_words = factory.get_stop_words(language, fail_safe=True)
    # Read through a context manager so the handle is always closed.
    with open(options.source, 'rb') as source_file:
        content = source_file.read().decode('utf-8')
    print(stop_words.rebase(content))
 def setUp(self):
     """
     Build a factory bound to the ``datas/`` directory located next to
     this module, with two short codes mapped to language names.
     """
     module_dir = os.path.dirname(os.path.abspath(__file__))
     self.data_directory = os.path.join(module_dir, 'datas/')
     language_codes = {'kl': 'klingon', 'si': 'sindarin'}
     self.factory = StopWordFactory(self.data_directory, language_codes)
Beispiel #6
0
def cmdline(argv=sys.argv[1:]):
    """
    Script for merging different collections of stop words.

    Takes a language followed by one or more source files, adds the
    collection read from each file to the stop words already known
    for that language, and writes the merged collection to the
    factory's collection file for that language.
    """
    parser = ArgumentParser(
        description='Create and merge collections of stop words')
    parser.add_argument('language', help='The language used in the collection')
    parser.add_argument(
        'sources', metavar='FILE', nargs='+', help='Source files to parse')
    args = parser.parse_args(argv)

    factory = StopWordFactory()
    merged = factory.get_stop_words(args.language, fail_safe=True)

    # Fold every source file into the running collection.
    for source in args.sources:
        extra_words = factory.read_collection(source)
        merged += StopWord(args.language, extra_words)

    target = factory.get_collection_filename(merged.language)
    factory.write_collection(target, merged.collection)
def cmdline(argv=sys.argv[1:]):
    """
    Script for merging different collections of stop words.

    Takes a language followed by one or more source files, adds the
    collection read from each file to the stop words already known
    for that language, and writes the merged collection to the
    factory's collection file for that language.
    """
    parser = ArgumentParser(
        description='Create and merge collections of stop words')
    parser.add_argument('language',
                        help='The language used in the collection')
    parser.add_argument('sources',
                        metavar='FILE',
                        nargs='+',
                        help='Source files to parse')
    parsed = parser.parse_args(argv)
    lang = parsed.language

    factory = StopWordFactory()
    words = factory.get_stop_words(lang, fail_safe=True)

    # Fold every source file into the running collection.
    for path in parsed.sources:
        words += StopWord(lang, factory.read_collection(path))

    destination = factory.get_collection_filename(words.language)
    factory.write_collection(destination, words.collection)
Beispiel #8
0
"""
Mots-vides
"""
from mots_vides.stop_words import StopWord
from mots_vides.factory import StopWordFactory
from mots_vides.exceptions import StopWordError

# Public names of the package: the three imported classes plus the
# module-level factory and shortcut function defined below.
__all__ = ['StopWord', 'StopWordFactory', 'StopWordError',
           'default_factory', 'stop_words']

# Factory built with default arguments when this module is imported;
# the ``stop_words`` shortcut delegates to it.
default_factory = StopWordFactory()


def stop_words(language, fail_safe=True):
    """
    Return the stop words of ``language`` from the module-level
    ``default_factory``, so callers do not need to build a factory.

    ``fail_safe`` is forwarded unchanged to the factory.
    """
    lookup = default_factory.get_stop_words
    return lookup(language, fail_safe=fail_safe)
class StopWordFactoryTestCase(TestCase):
    """
    Tests for ``StopWordFactory``, run against the ``datas/`` fixture
    directory located next to this module.
    """

    def setUp(self):
        # Fresh factory per test, pointed at the fixture directory and
        # given two short codes mapped to language names.
        self.data_directory = os.path.join(
            os.path.dirname(
                os.path.abspath(__file__)),
            'datas/')
        self.factory = StopWordFactory(self.data_directory,
                                       {'kl': 'klingon',
                                        'si': 'sindarin'})

    def test_get_stopwords(self):
        sw = self.factory.get_stop_words('klingon')
        self.assertIsInstance(sw, StopWord)
        self.assertEqual(sorted(sw.collection),
                         ["HIja'", "ghobe'", 'naDev', 'nuq'])

    def test_get_stopwords_shortcuts(self):
        # The short code must resolve to the same collection as the
        # full language name.
        sw = self.factory.get_stop_words('kl')
        self.assertEqual(sorted(sw.collection),
                         ["HIja'", "ghobe'", 'naDev', 'nuq'])

    def test_get_stopwords_unavailable_language(self):
        # Unknown language: error by default, empty collection when
        # fail_safe is requested.
        self.assertRaises(StopWordError, self.factory.get_stop_words, 'vulcan')
        sw = self.factory.get_stop_words('vulcan', fail_safe=True)
        self.assertEqual(list(sw.collection), [])

    def test_get_stopwords_file_unreadable(self):
        self.factory.available_languages  # Fill the cache, pass security
        self.factory.data_directory = '/brutal/change/'
        self.assertRaises(StopWordError,
                          self.factory.get_stop_words, 'klingon')
        sw = self.factory.get_stop_words('klingon', fail_safe=True)
        self.assertEqual(list(sw.collection), [])

    def test_get_stopwords_cache(self):
        self.assertEqual(self.factory.LOADED_LANGUAGES_CACHE, {})
        self.factory.get_stop_words('klingon')
        self.assertEqual(list(self.factory.LOADED_LANGUAGES_CACHE.keys()),
                         ['klingon'])
        # Loading through the short code must not add a second entry.
        sw = self.factory.get_stop_words('kl')
        self.assertEqual(list(self.factory.LOADED_LANGUAGES_CACHE.keys()),
                         ['klingon'])
        # With the directory gone, the result can only come from the cache.
        self.factory.data_directory = '/brutal/change/'
        self.assertEqual(sw.collection,
                         self.factory.get_stop_words('klingon').collection)

    def test_get_stopwords_cache_and_errors(self):
        # Failed or fail-safe lookups must never populate the cache.
        self.assertRaises(StopWordError, self.factory.get_stop_words, 'vulcan')
        self.assertRaises(StopWordError, self.factory.get_stop_words, 'vulcan')
        self.assertEqual(self.factory.LOADED_LANGUAGES_CACHE, {})
        self.factory.get_stop_words('vulcan', fail_safe=True)
        self.assertEqual(self.factory.LOADED_LANGUAGES_CACHE, {})
        self.assertRaises(StopWordError, self.factory.get_stop_words, 'vulcan')

    def test_get_stopwords_cache_twice_python3(self):
        sw = self.factory.get_stop_words('klingon')
        # assertEqual, not the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(len(sw), len(self.factory.get_stop_words('klingon')))

    def test_available_languages(self):
        self.assertEqual(self.factory.available_languages,
                         ['klingon', 'sindarin'])
        # A second access after the directory change must return the
        # same list.
        self.factory.data_directory = '/brutal/change/'
        self.assertEqual(self.factory.available_languages,
                         ['klingon', 'sindarin'])

    def test_available_languages_error(self):
        self.factory.data_directory = '/brutal/change/'
        # The lambda defers the attribute access so assertRaises can
        # catch the error it raises.
        self.assertRaises(StopWordError,
                          lambda: self.factory.available_languages)

    def test_get_collection_filename(self):
        filename = self.factory.get_collection_filename('foo')
        self.assertTrue(filename.endswith('foo.txt'))
        self.assertTrue(filename.startswith(self.data_directory))

    def test_read_collection(self):
        collection_text = 'egor\n\n   \nai\n'
        # The context manager closes (and removes) the temporary file
        # even when an assertion fails.
        with NamedTemporaryFile() as collection_file:
            collection_file.write(collection_text.encode('utf-8'))
            # Rewinding also flushes the buffered bytes before the file
            # is re-read by name.
            collection_file.seek(0)
            collection = self.factory.read_collection(collection_file.name)
            self.assertEqual(list(collection), ['egor', 'ai'])

    def test_write_collection(self):
        # The context manager closes (and removes) the temporary file
        # even when an assertion fails.
        with NamedTemporaryFile() as collection_file:
            self.factory.write_collection(
                collection_file.name,
                ['nuq', "HIja'", "ghobe'", 'naDev'])
            collection_file.seek(0)
            # The expected text is the input words in sorted order,
            # newline-separated, with no trailing newline.
            self.assertEqual(collection_file.read().decode('utf-8'),
                             "HIja'\nghobe'\nnaDev\nnuq")