def test_detect_language_empty_text_input(self):
        # An empty tuple is passed as the encoded text; the test expects
        # detect_language to answer with an empty dict.
        detector = LanguageDetector((3, ), 10)
        empty_text = ()

        self.assertEqual({}, detector.detect_language(empty_text))
    def test_new_language_incorrect_input(self):
        # Each malformed value is passed as the encoded text; the test expects
        # new_language to return 1 for every one of them.
        language_detector = LanguageDetector((3, ), 10)

        expected = 1
        bad_inputs = [[], {}, '', 123, None, True, (None, )]
        for bad_input in bad_inputs:
            # subTest reports which input failed and keeps checking the rest
            with self.subTest(bad_input=bad_input):
                actual = language_detector.new_language(bad_input, 'english')
                self.assertEqual(expected, actual)
    def test_detect_language_not_filled_ngram_storages(self):
        # No language is registered on the detector before the call, and the
        # test expects detect_language to return an empty dict.
        detector = LanguageDetector((2, 3), 100)
        sample = (((1, 2, 3), ), )

        self.assertEqual({}, detector.detect_language(sample))
    def test_detect_language_incorrect_text_input(self):
        # Each malformed value is passed as the encoded text; the test expects
        # detect_language to return an empty dict for every one of them.
        language_detector = LanguageDetector((3, ), 10)

        expected = {}
        bad_inputs = [[], {}, set(), '', 123, -1, 0.7, None, True, (None, )]
        for bad_input in bad_inputs:
            # subTest reports which input failed and keeps checking the rest
            with self.subTest(bad_input=bad_input):
                actual = language_detector.detect_language(bad_input)
                self.assertEqual(expected, actual)
    def test_calculate_distance_ideal(self):
        # Two equally long n-gram sequences that share some items at different
        # positions; the test pins the resulting distance to 17.
        # NOTE(review): 17 is consistent with an out-of-place measure
        # (0 + 2 + 0 + 6 + 3 + 6, a missing n-gram costing the length of the
        # other sequence) - the implementation is not visible here, confirm.
        detector = LanguageDetector((2, 3), 10)
        left = ((1, 2), (3, 4), (7, 8), (9, 10), (5, 6), (13, 14))
        right = ((1, 2), (5, 6), (7, 8), (3, 4), (11, 12), (15, 16))

        distance = detector._calculate_distance(left, right)

        self.assertEqual(17, distance)
    def test_new_language_incorrect_language_input(self):
        # Each malformed value is passed as the language name; the test
        # expects new_language to return 1 for every one of them.
        language_detector = LanguageDetector((3,), 10)

        expected = 1
        patches_encoded_text = (((),),)
        bad_inputs = [[], {}, (), 123, None, True]
        for bad_input in bad_inputs:
            # subTest reports which input failed and keeps checking the rest
            with self.subTest(bad_input=bad_input):
                actual = language_detector.new_language(patches_encoded_text,
                                                        bad_input)
                self.assertEqual(expected, actual)
    def test_calculate_distance_is_empty(self):
        # When either argument is an empty tuple the test expects a
        # distance of 0, regardless of argument order.
        detector = LanguageDetector((3, ), 10)
        nothing = ()
        filled = ('th', 'er', 'on', 'le', 'ing', 'and')

        empty_vs_filled = detector._calculate_distance(nothing, filled)
        filled_vs_empty = detector._calculate_distance(filled, nothing)

        self.assertEqual(0, empty_vs_filled)
        self.assertEqual(0, filled_vs_empty)
    def test_new_language_creates_several_ngrams(self):
        # With trie levels (2, 3) the test expects a truthy storage entry for
        # each level under the 'english' key after new_language is called.
        letter_storage = LetterStorage()
        language_detector = LanguageDetector((2, 3), 10)

        # the context manager closes the file even if reading/tokenizing raises
        with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file:
            text = tokenize_by_sentence(file.read())
        letter_storage.update(text)
        encoded_text = encode_corpus(letter_storage, text)

        language_detector.new_language(encoded_text, 'english')
        self.assertTrue(language_detector.n_gram_storages['english'][2])
        self.assertTrue(language_detector.n_gram_storages['english'][3])
    def test_calculate_distance_incorrect_input(self):
        # Each malformed value is tried in both argument positions; the test
        # expects _calculate_distance to return -1 either way.
        language_detector = LanguageDetector((3, ), 10)

        patches_ngrams = ((1, 2), (3, 4), (7, 8), (9, 10), (5, 6), (13, 14))

        expected = -1
        bad_inputs = [[], {}, '', 1, -1, 9.22, None, True, (None, )]
        for bad_input in bad_inputs:
            # subTest reports which input failed and keeps checking the rest
            with self.subTest(bad_input=bad_input):
                actual_first = language_detector._calculate_distance(
                    bad_input, patches_ngrams)
                actual_second = language_detector._calculate_distance(
                    patches_ngrams, bad_input)
                self.assertEqual(expected, actual_first)
                self.assertEqual(expected, actual_second)
    def test_new_language_add_existing_language(self):
        # The same corpus is registered twice under the same name; the test
        # expects the second new_language call to return 0.
        letter_storage = LetterStorage()
        language_detector = LanguageDetector((3, ), 10)

        # the context manager closes the file even if reading/tokenizing raises
        with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file:
            text = tokenize_by_sentence(file.read())
        letter_storage.update(text)
        encoded_text = encode_corpus(letter_storage, text)

        expected = 0
        language_detector.new_language(encoded_text, 'german')
        actual = language_detector.new_language(encoded_text, 'german')
        self.assertEqual(expected, actual)
    def test_detect_language_uses_several_ngrams(self):
        # Two corpora are registered as 'english' and 'german' on a detector
        # with trie levels (2, 3); the test expects the value reported for
        # 'german' to exceed the one for 'english' on the third text.
        letter_storage = LetterStorage()
        language_detector = LanguageDetector((2, 3), 100)

        # context managers close every file even if reading/tokenizing raises
        with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file_first, \
                open('lab_3/Thomas_Mann.txt', 'r', encoding='utf-8') as file_second, \
                open('lab_3/unknown_Arthur_Conan_Doyle.txt',
                     'r',
                     encoding='utf-8') as file_third:
            text_english = tokenize_by_sentence(file_first.read())
            text_german = tokenize_by_sentence(file_second.read())
            text_unknown = tokenize_by_sentence(file_third.read())

        # the storage must know the letters of all texts before any encoding
        letter_storage.update(text_english)
        letter_storage.update(text_german)
        letter_storage.update(text_unknown)
        encoded_english = encode_corpus(letter_storage, text_english)
        encoded_german = encode_corpus(letter_storage, text_german)
        encoded_unknown = encode_corpus(letter_storage, text_unknown)

        language_detector.new_language(encoded_english, 'english')
        language_detector.new_language(encoded_german, 'german')

        actual = language_detector.detect_language(encoded_unknown)
        # assertGreater prints both values on failure
        self.assertGreater(actual['german'], actual['english'])
    def test_new_language_storage_already_created(self):
        # Two languages are registered one after another; the test expects
        # both to have a truthy storage entry whose level-3 item is an
        # NGramTrie.
        letter_storage = LetterStorage()
        language_detector = LanguageDetector((3, ), 10)

        # context managers close both files even if reading/tokenizing raises
        with open('lab_3/Thomas_Mann.txt', 'r', encoding='utf-8') as file, \
                open('lab_3/unknown_Arthur_Conan_Doyle.txt',
                     'r',
                     encoding='utf-8') as file_unknown:
            text = tokenize_by_sentence(file.read())
            text_unknown = tokenize_by_sentence(file_unknown.read())

        letter_storage.update(text)
        letter_storage.update(text_unknown)
        encoded_text = encode_corpus(letter_storage, text)
        encoded_unknown_text = encode_corpus(letter_storage, text_unknown)

        language_detector.new_language(encoded_text, 'german')
        language_detector.new_language(encoded_unknown_text, 'english')
        self.assertTrue(language_detector.n_gram_storages['german'])
        self.assertTrue(language_detector.n_gram_storages['english'])
        self.assertIsInstance(language_detector.n_gram_storages['german'][3],
                              NGramTrie)
        self.assertIsInstance(language_detector.n_gram_storages['english'][3],
                              NGramTrie)
 def test_detect_language_calls_required_method(self, mock):
     # `mock` is supplied by the caller (presumably a patch decorator that is
     # not visible at this point); the test expects it to have been called
     # once detect_language has run on a registered language.
     letter_storage = LetterStorage()
     language_detector = LanguageDetector((3, ), 10)
     text_to_detect = (((1, 2, 3), ), )
     # the context manager closes the file even if reading/tokenizing raises
     with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file:
         text = tokenize_by_sentence(file.read())
     letter_storage.update(text)
     encoded_text = encode_corpus(letter_storage, text)
     language_detector.new_language(encoded_text, 'english')
     language_detector.detect_language(text_to_detect)
     self.assertTrue(mock.called)
 def test_language_detector_creates_correctly(self):
     # A fresh detector is expected to expose the constructor arguments as
     # trie_levels / top_k and to start with an empty n_gram_storages dict.
     levels, top_k = (3, ), 10
     detector = LanguageDetector(levels, top_k)

     self.assertEqual(detector.trie_levels, levels)
     self.assertEqual(detector.top_k, top_k)
     self.assertEqual(detector.n_gram_storages, {})
class LanguageDetectorTest(unittest.TestCase):
    """
    Checks for LanguageDetector class
    """

    # Corpus files shared by the tests below.
    BAUM_PATH = 'lab_3/Frank_Baum.txt'
    MANN_PATH = 'lab_3/Thomas_Mann.txt'
    DOYLE_PATH = 'lab_3/unknown_Arthur_Conan_Doyle.txt'

    @staticmethod
    def _encode_files(*paths):
        """
        Read, tokenize and encode the given files with one shared LetterStorage.

        :param paths: file paths, processed in the given order
        :return: tuple of encoded corpora, one per path, in the same order
        """
        texts = []
        for path in paths:
            # the context manager closes the file even if reading or
            # tokenizing raises
            with open(path, 'r', encoding='utf-8') as file:
                texts.append(tokenize_by_sentence(file.read()))

        # the storage is updated with every text before any of them is encoded
        letter_storage = LetterStorage()
        for text in texts:
            letter_storage.update(text)
        return tuple(encode_corpus(letter_storage, text) for text in texts)

    def test_language_detector_creates_correctly(self):
        # A fresh detector exposes its constructor arguments and starts with
        # an empty n_gram_storages dict.
        language_detector = LanguageDetector((3, ), 10)
        self.assertEqual(language_detector.trie_levels, (3, ))
        self.assertEqual(language_detector.top_k, 10)
        self.assertEqual(language_detector.n_gram_storages, {})

    # ---------------------------------------------------------------
    # new_language

    def test_new_language_ideal_case(self):
        # After registering one corpus the test expects a truthy storage
        # entry whose level-3 item is an NGramTrie.
        (encoded_text, ) = self._encode_files(self.BAUM_PATH)
        language_detector = LanguageDetector((3, ), 10)

        language_detector.new_language(encoded_text, 'english')
        self.assertTrue(language_detector.n_gram_storages['english'])
        self.assertIsInstance(language_detector.n_gram_storages['english'][3],
                              NGramTrie)

    def test_new_language_incorrect_input(self):
        # Malformed encoded text: the test expects new_language to return 1.
        language_detector = LanguageDetector((3, ), 10)

        expected = 1
        bad_inputs = [[], {}, '', 123, None, True, (None, )]
        for bad_input in bad_inputs:
            # subTest reports which input failed and keeps checking the rest
            with self.subTest(bad_input=bad_input):
                actual = language_detector.new_language(bad_input, 'english')
                self.assertEqual(expected, actual)

    def test_new_language_incorrect_language_input(self):
        # Malformed language name: the test expects new_language to return 1.
        language_detector = LanguageDetector((3, ), 10)

        expected = 1
        patches_encoded_text = (((), ), )
        bad_inputs = [[], {}, (), 123, None, True]
        for bad_input in bad_inputs:
            with self.subTest(bad_input=bad_input):
                actual = language_detector.new_language(patches_encoded_text,
                                                        bad_input)
                self.assertEqual(expected, actual)

    def test_new_language_storage_already_created(self):
        # Registering a second language must leave both entries in place.
        encoded_text, encoded_unknown_text = self._encode_files(
            self.MANN_PATH, self.DOYLE_PATH)
        language_detector = LanguageDetector((3, ), 10)

        language_detector.new_language(encoded_text, 'german')
        language_detector.new_language(encoded_unknown_text, 'english')
        self.assertTrue(language_detector.n_gram_storages['german'])
        self.assertTrue(language_detector.n_gram_storages['english'])
        self.assertIsInstance(language_detector.n_gram_storages['german'][3],
                              NGramTrie)
        self.assertIsInstance(language_detector.n_gram_storages['english'][3],
                              NGramTrie)

    def test_new_language_add_existing_language(self):
        # Registering the same name twice: the test expects the second call
        # to return 0.
        (encoded_text, ) = self._encode_files(self.BAUM_PATH)
        language_detector = LanguageDetector((3, ), 10)

        expected = 0
        language_detector.new_language(encoded_text, 'german')
        actual = language_detector.new_language(encoded_text, 'german')
        self.assertEqual(expected, actual)

    def test_new_language_creates_several_ngrams(self):
        # With trie levels (2, 3) both levels must have a truthy entry.
        (encoded_text, ) = self._encode_files(self.BAUM_PATH)
        language_detector = LanguageDetector((2, 3), 10)

        language_detector.new_language(encoded_text, 'english')
        self.assertTrue(language_detector.n_gram_storages['english'][2])
        self.assertTrue(language_detector.n_gram_storages['english'][3])

    # ---------------------------------------------------------------
    # _calculate_distance

    def test_calculate_distance_ideal(self):
        # Two equally long n-gram sequences sharing some items at different
        # positions; the test pins the resulting distance to 17.
        language_detector = LanguageDetector((2, 3), 10)

        first_n_grams = ((1, 2), (3, 4), (7, 8), (9, 10), (5, 6), (13, 14))
        second_n_grams = ((1, 2), (5, 6), (7, 8), (3, 4), (11, 12), (15, 16))

        expected = 17
        actual = language_detector._calculate_distance(first_n_grams,
                                                       second_n_grams)
        self.assertEqual(expected, actual)

    def test_calculate_distance_incorrect_input(self):
        # Malformed value in either argument position: the test expects -1.
        language_detector = LanguageDetector((3, ), 10)

        patches_ngrams = ((1, 2), (3, 4), (7, 8), (9, 10), (5, 6), (13, 14))

        expected = -1
        bad_inputs = [[], {}, '', 1, -1, 9.22, None, True, (None, )]
        for bad_input in bad_inputs:
            with self.subTest(bad_input=bad_input):
                actual_first = language_detector._calculate_distance(
                    bad_input, patches_ngrams)
                actual_second = language_detector._calculate_distance(
                    patches_ngrams, bad_input)
                self.assertEqual(expected, actual_first)
                self.assertEqual(expected, actual_second)

    def test_calculate_distance_is_empty(self):
        # An empty tuple in either argument position: the test expects 0.
        language_detector = LanguageDetector((3, ), 10)

        empty_sentence = ()
        patches_sentence = ('th', 'er', 'on', 'le', 'ing', 'and')
        expected = 0
        actual_first = language_detector._calculate_distance(
            empty_sentence, patches_sentence)
        actual_second = language_detector._calculate_distance(
            patches_sentence, empty_sentence)
        self.assertEqual(expected, actual_first)
        self.assertEqual(expected, actual_second)

    # ---------------------------------------------------------------
    # detect_language

    def test_detect_language_ideal(self):
        # The value reported for 'german' is expected to exceed the one for
        # 'english' on the third text.
        encoded_english, encoded_german, encoded_unknown = self._encode_files(
            self.BAUM_PATH, self.MANN_PATH, self.DOYLE_PATH)
        language_detector = LanguageDetector((3, ), 100)

        language_detector.new_language(encoded_english, 'english')
        language_detector.new_language(encoded_german, 'german')

        actual = language_detector.detect_language(encoded_unknown)
        # assertGreater prints both values on failure
        self.assertGreater(actual['german'], actual['english'])

    def test_detect_language_incorrect_text_input(self):
        # Malformed encoded text: the test expects an empty dict.
        language_detector = LanguageDetector((3, ), 10)

        expected = {}
        bad_inputs = [[], {}, set(), '', 123, -1, 0.7, None, True, (None, )]
        for bad_input in bad_inputs:
            with self.subTest(bad_input=bad_input):
                actual = language_detector.detect_language(bad_input)
                self.assertEqual(expected, actual)

    def test_detect_language_empty_text_input(self):
        # Empty encoded text: the test expects an empty dict.
        language_detector = LanguageDetector((3, ), 10)

        encoded_text = ()
        expected = {}
        actual = language_detector.detect_language(encoded_text)
        self.assertEqual(expected, actual)

    @patch('lab_3.main.LanguageDetector._calculate_distance',
           side_effect=LanguageDetector()._calculate_distance)
    def test_detect_language_calls_required_method(self, mock):
        # _calculate_distance is patched with a mock that delegates to the
        # real method; detect_language is expected to call it.
        (encoded_text, ) = self._encode_files(self.BAUM_PATH)
        language_detector = LanguageDetector((3, ), 10)
        text_to_detect = (((1, 2, 3), ), )

        language_detector.new_language(encoded_text, 'english')
        language_detector.detect_language(text_to_detect)
        self.assertTrue(mock.called)

    def test_detect_language_uses_several_ngrams(self):
        # Same expectation as the ideal case, with trie levels (2, 3).
        encoded_english, encoded_german, encoded_unknown = self._encode_files(
            self.BAUM_PATH, self.MANN_PATH, self.DOYLE_PATH)
        language_detector = LanguageDetector((2, 3), 100)

        language_detector.new_language(encoded_english, 'english')
        language_detector.new_language(encoded_german, 'german')

        actual = language_detector.detect_language(encoded_unknown)
        self.assertGreater(actual['german'], actual['english'])

    def test_detect_language_not_filled_ngram_storages(self):
        # No language registered: the test expects an empty dict.
        language_detector = LanguageDetector((2, 3), 100)

        patches_sentence = (((1, 2, 3), ), )

        expected = {}
        actual = language_detector.detect_language(patches_sentence)
        self.assertEqual(expected, actual)
# Example #16
# 0
    # NOTE(review): german_file, english_file, unknown_file and text_unk are
    # used below but not defined in the visible part of this script -
    # presumably they are set up just above; confirm.
    text_ger = tokenize_by_sentence(german_file.read())
    text_eng = tokenize_by_sentence(english_file.read())
    english_file.close()
    german_file.close()
    unknown_file.close()

    # One storage is updated with all three texts before any of them is encoded.
    letter_storage = LetterStorage()
    letter_storage.update(text_eng)
    letter_storage.update(text_ger)
    letter_storage.update(text_unk)

    eng_encoded = encode_corpus(letter_storage, text_eng)
    unk_encoded = encode_corpus(letter_storage, text_unk)
    ger_encoded = encode_corpus(letter_storage, text_ger)

    # Register both known languages on a detector built with trie levels
    # (3, 4, 5) and 1000 as the second argument.
    language_detector = LanguageDetector((3, 4, 5), 1000)
    language_detector.new_language(eng_encoded, 'english')
    language_detector.new_language(ger_encoded, 'german')

    # Build a 4-gram trie for the unknown text and hand its n_grams to the detector.
    ngram_unknown = NGramTrie(4)
    ngram_unknown.fill_n_grams(unk_encoded)

    language_log_probability_dict = language_detector.detect_language(ngram_unknown.n_grams)

    # NOTE(review): a larger value for 'german' selects 'english', which
    # suggests the returned values behave like distances (smaller = closer)
    # despite the "log_probability" variable name - confirm against
    # detect_language.
    if language_log_probability_dict['german'] >\
language_log_probability_dict['english']:
        RESULT = 'english'
    else:
        RESULT = 'german'

    print('this is a {} text.'.format(RESULT))
# Example #17
# 0
"""
Language detector implementation starter
"""

from lab_3.main import tokenize_by_sentence, LanguageDetector
from lab_3.main import encode_corpus
from lab_3.main import LetterStorage

if __name__ == '__main__':

    # here goes your function calls
    letter_storage = LetterStorage()
    language_detector = LanguageDetector((3, ), 100)

    file_first = open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8')
    file_second = open('lab_3/Thomas_Mann.txt', 'r', encoding='utf-8')
    file_third = open('lab_3/unknown_Arthur_Conan_Doyle.txt',
                      'r',
                      encoding='utf-8')

    text_english = tokenize_by_sentence(file_first.read())
    text_german = tokenize_by_sentence(file_second.read())
    text_unknown = tokenize_by_sentence(file_third.read())
    letter_storage.update(text_english)
    letter_storage.update(text_german)
    letter_storage.update(text_unknown)
    encoded_english = encode_corpus(letter_storage, text_english)
    encoded_german = encode_corpus(letter_storage, text_german)
    encoded_unknown = encode_corpus(letter_storage, text_unknown)
    file_first.close()
    file_second.close()