Ejemplo n.º 1
0
    def test_probability_language_detector_detect_incorrect(self):
        """
        Checks that detect_language returns an empty dict for every malformed input
        """
        language_detector = ProbabilityLanguageDetector((3, 5), 100)
        bad_inputs = [[], {}, '', 1, None, True, (None,)]

        expected = {}
        for bad_input in bad_inputs:
            # subTest keeps checking the remaining inputs after a failure and
            # names the offending input in the report.
            with self.subTest(bad_input=bad_input):
                actual = language_detector.detect_language(bad_input)
                self.assertEqual(expected, actual)
Ejemplo n.º 2
0
    def test_probability_language_detector_calculate_probability_incorrect_text(self):
        """
        Checks that _calculate_sentence_probability returns -1.0
        when the second argument (the encoded text) is malformed
        """
        language_detector = ProbabilityLanguageDetector((2, 3), 10)
        bad_inputs = [[], {}, '', None, True, set()]
        ngram_trie = NGramTrie(5)

        expected = -1.0
        for bad_input in bad_inputs:
            # subTest keeps checking the remaining inputs after a failure and
            # names the offending input in the report.
            with self.subTest(bad_input=bad_input):
                actual = language_detector._calculate_sentence_probability(ngram_trie,
                                                                           bad_input)
                self.assertEqual(expected, actual)
Ejemplo n.º 3
0
    def test_probability_language_detector_calculate_probability_incorrect_storage(self):
        """
        Checks that _calculate_sentence_probability returns -1.0
        when the first argument (the n-gram storage) is malformed
        """
        language_detector = ProbabilityLanguageDetector((2, 3), 10)
        bad_inputs = [(), [], {}, '', None, True, set()]
        patches_encoded_unknown = (((),),)

        expected = -1.0
        for bad_input in bad_inputs:
            # subTest keeps checking the remaining inputs after a failure and
            # names the offending input in the report.
            with self.subTest(bad_input=bad_input):
                actual = language_detector._calculate_sentence_probability(
                    bad_input, patches_encoded_unknown)
                self.assertEqual(expected, actual)
Ejemplo n.º 4
0
    def test_probability_language_detector_calls_required_method(self, mock):
        """
        Checks that detect_language triggers the mocked method

        :param mock: mock object whose ``called`` flag is inspected;
            presumably injected by a patch decorator outside this snippet
        """
        # Context managers close every file even if reading or tokenizing raises.
        with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
            text_unk = tokenize_by_sentence(unknown_file.read())
        with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
            text_ger = tokenize_by_sentence(german_file.read())
        with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
            text_eng = tokenize_by_sentence(english_file.read())

        # One storage is shared by all three texts; update order is english, german, unknown.
        letter_storage = LetterStorage()
        letter_storage.update(text_eng)
        letter_storage.update(text_ger)
        letter_storage.update(text_unk)

        eng_encoded = encode_corpus(letter_storage, text_eng)
        unk_encoded = encode_corpus(letter_storage, text_unk)
        ger_encoded = encode_corpus(letter_storage, text_ger)

        language_detector = ProbabilityLanguageDetector((3, 4, 5), 1000)
        language_detector.new_language(eng_encoded, 'english')
        language_detector.new_language(ger_encoded, 'german')

        ngram_unknown = NGramTrie(4)
        ngram_unknown.fill_n_grams(unk_encoded)

        language_detector.detect_language(ngram_unknown.n_grams)
        self.assertTrue(mock.called)
Ejemplo n.º 5
0
    def test_probability_language_detector_detect_language_ideal(self):
        """
        Checks that detect_language, given the encoded unknown text,
        reports a larger value for 'german' than for 'english'
        """
        # Context managers close every file even if reading or tokenizing raises.
        with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
            text_unk = tokenize_by_sentence(unknown_file.read())
        with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
            text_ger = tokenize_by_sentence(german_file.read())
        with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
            text_eng = tokenize_by_sentence(english_file.read())

        # One storage is shared by all three texts; update order is english, german, unknown.
        letter_storage = LetterStorage()
        letter_storage.update(text_eng)
        letter_storage.update(text_ger)
        letter_storage.update(text_unk)

        eng_encoded = encode_corpus(letter_storage, text_eng)
        unk_encoded = encode_corpus(letter_storage, text_unk)
        ger_encoded = encode_corpus(letter_storage, text_ger)

        language_detector = ProbabilityLanguageDetector((3, 4, 5), 1000)
        language_detector.new_language(eng_encoded, 'english')
        language_detector.new_language(ger_encoded, 'german')

        actual = language_detector.detect_language(unk_encoded)
        # assertGreater reports both compared values when the check fails.
        self.assertGreater(actual['german'], actual['english'])
Ejemplo n.º 6
0
    def test_probability_language_detector_calculate_probability_ideal(self):
        """
        Checks that, for 3-grams of the unknown text, _calculate_sentence_probability
        yields a larger value with the english storage than with the german one
        """
        print('launching test')
        # Context managers close every file even if reading or tokenizing raises.
        with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
            english_text = tokenize_by_sentence(english_file.read())
        with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
            german_text = tokenize_by_sentence(german_file.read())
        with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
            unknown_text = tokenize_by_sentence(unknown_file.read())

        # One storage is shared by all three texts; update order is english, german, unknown.
        letter_storage = LetterStorage()
        letter_storage.update(english_text)
        letter_storage.update(german_text)
        letter_storage.update(unknown_text)

        english_encoded = encode_corpus(letter_storage, english_text)
        german_encoded = encode_corpus(letter_storage, german_text)
        unknown_encoded = encode_corpus(letter_storage, unknown_text)

        language_detector = ProbabilityLanguageDetector((3,), 1000)
        language_detector.new_language(english_encoded, 'english')
        language_detector.new_language(german_encoded, 'german')

        # Per-language storages are indexed by language name and then by n-gram size.
        n3_gram_trie_english = language_detector.n_gram_storages['english'][3]
        n3_gram_trie_german = language_detector.n_gram_storages['german'][3]

        n3_gram_unknown = NGramTrie(3)
        n3_gram_unknown.fill_n_grams(unknown_encoded)

        english_prob = language_detector._calculate_sentence_probability(
            n3_gram_trie_english, n3_gram_unknown.n_grams)
        german_prob = language_detector._calculate_sentence_probability(
            n3_gram_trie_german, n3_gram_unknown.n_grams)
        print(f'English_sentence_prob: {english_prob}')
        print(f'Deutsch_sentence_prob: {german_prob}')
        # assertGreater reports both compared values when the check fails.
        self.assertGreater(english_prob, german_prob)
Ejemplo n.º 7
0
    def test_probability_language_detector_several_ngrams_case(self):
        """
        Checks a detector built with several n-gram sizes: for 5-grams of the unknown text,
        the german storage is expected to yield a larger value than the english one
        """
        language_detector = ProbabilityLanguageDetector((3, 5), 1000)

        # Context managers close every file even if reading or tokenizing raises.
        with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
            eng_text = tokenize_by_sentence(english_file.read())
        with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
            ger_text = tokenize_by_sentence(german_file.read())
        with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
            unk_text = tokenize_by_sentence(unknown_file.read())

        # One storage is shared by all three texts; update order is english, german, unknown.
        letter_storage = LetterStorage()
        letter_storage.update(eng_text)
        letter_storage.update(ger_text)
        letter_storage.update(unk_text)

        english_encoded = encode_corpus(letter_storage, eng_text)
        german_encoded = encode_corpus(letter_storage, ger_text)
        unknown_encoded = encode_corpus(letter_storage, unk_text)

        language_detector.new_language(english_encoded, 'english')
        language_detector.new_language(german_encoded, 'german')

        # Despite the names, these are the 5-gram storages, not probabilities.
        eng_prob = language_detector.n_gram_storages['english'][5]
        ger_prob = language_detector.n_gram_storages['german'][5]

        ngram_trie = NGramTrie(5)
        ngram_trie.fill_n_grams(unknown_encoded)

        eng = language_detector._calculate_sentence_probability(
            eng_prob, ngram_trie.n_grams)
        ger = language_detector._calculate_sentence_probability(
            ger_prob, ngram_trie.n_grams)
        # assertGreater reports both compared values when the check fails.
        self.assertGreater(ger, eng)
Ejemplo n.º 8
0
 def test_probability_language_detector_check_creation(self):
     """
     Checks the attributes of a freshly constructed detector
     """
     levels, limit = (3, 5), 10
     detector = ProbabilityLanguageDetector(levels, limit)
     # Constructor arguments are expected back unchanged; no languages are stored yet.
     self.assertEqual(detector.trie_levels, levels)
     self.assertEqual(detector.top_k, limit)
     self.assertEqual(detector.n_gram_storages, {})
Ejemplo n.º 9
0
class ProbabilityLanguageDetectorTest(unittest.TestCase):
    """
    Checks for ProbabilityLanguageDetector class
    """

    # Fixture files shared by the corpus-based tests.
    _ENGLISH_PATH = 'lab_3/Frank_Baum.txt'
    _GERMAN_PATH = 'lab_3/Thomas_Mann.txt'
    _UNKNOWN_PATH = 'lab_3/unknown_Arthur_Conan_Doyle.txt'

    @staticmethod
    def _read_sentences(path):
        """
        Reads a UTF-8 text file and returns tokenize_by_sentence applied to its contents

        :param path: path to the text file
        :return: whatever tokenize_by_sentence produces for the file contents
        """
        # The context manager closes the file even if reading or tokenizing raises.
        with open(path, encoding='utf-8') as text_file:
            return tokenize_by_sentence(text_file.read())

    @classmethod
    def _load_encoded_corpora(cls):
        """
        Loads the three fixture texts and encodes them with one shared LetterStorage

        :return: tuple (english_encoded, german_encoded, unknown_encoded)
        """
        texts = (cls._read_sentences(cls._ENGLISH_PATH),
                 cls._read_sentences(cls._GERMAN_PATH),
                 cls._read_sentences(cls._UNKNOWN_PATH))

        # The storage is filled in a fixed order: english, german, unknown.
        letter_storage = LetterStorage()
        for text in texts:
            letter_storage.update(text)

        # NOTE(review): assumes encode_corpus only reads the storage, so the
        # order in which the texts are encoded does not matter - confirm.
        english_encoded, german_encoded, unknown_encoded = (
            encode_corpus(letter_storage, text) for text in texts)
        return english_encoded, german_encoded, unknown_encoded

    def test_probability_language_detector_check_creation(self):
        """
        Checks the attributes of a freshly constructed detector
        """
        language_detector = ProbabilityLanguageDetector((3, 5), 10)
        self.assertEqual(language_detector.trie_levels, (3, 5))
        self.assertEqual(language_detector.top_k, 10)
        self.assertEqual(language_detector.n_gram_storages, {})

    def test_probability_language_detector_calculate_probability_ideal(self):
        """
        Checks that, for 3-grams of the unknown text, _calculate_sentence_probability
        yields a larger value with the english storage than with the german one
        """
        print('launching test')
        english_encoded, german_encoded, unknown_encoded = self._load_encoded_corpora()

        language_detector = ProbabilityLanguageDetector((3,), 1000)
        language_detector.new_language(english_encoded, 'english')
        language_detector.new_language(german_encoded, 'german')

        # Per-language storages are indexed by language name and then by n-gram size.
        n3_gram_trie_english = language_detector.n_gram_storages['english'][3]
        n3_gram_trie_german = language_detector.n_gram_storages['german'][3]

        n3_gram_unknown = NGramTrie(3)
        n3_gram_unknown.fill_n_grams(unknown_encoded)

        english_prob = language_detector._calculate_sentence_probability(
            n3_gram_trie_english, n3_gram_unknown.n_grams)
        german_prob = language_detector._calculate_sentence_probability(
            n3_gram_trie_german, n3_gram_unknown.n_grams)
        print(f'English_sentence_prob: {english_prob}')
        print(f'Deutsch_sentence_prob: {german_prob}')
        # assertGreater reports both compared values when the check fails.
        self.assertGreater(english_prob, german_prob)

    def test_probability_language_detector_calculate_probability_incorrect_storage(self):
        """
        Checks that _calculate_sentence_probability returns -1.0
        when the first argument (the n-gram storage) is malformed
        """
        language_detector = ProbabilityLanguageDetector((2, 3), 10)
        bad_inputs = [(), [], {}, '', None, True, set()]
        patches_encoded_unknown = (((),),)

        expected = -1.0
        for bad_input in bad_inputs:
            # subTest keeps checking the remaining inputs after a failure and
            # names the offending input in the report.
            with self.subTest(bad_input=bad_input):
                actual = language_detector._calculate_sentence_probability(
                    bad_input, patches_encoded_unknown)
                self.assertEqual(expected, actual)

    def test_probability_language_detector_calculate_probability_incorrect_text(self):
        """
        Checks that _calculate_sentence_probability returns -1.0
        when the second argument (the encoded text) is malformed
        """
        language_detector = ProbabilityLanguageDetector((2, 3), 10)
        bad_inputs = [[], {}, '', None, True, set()]
        ngram_trie = NGramTrie(5)

        expected = -1.0
        for bad_input in bad_inputs:
            with self.subTest(bad_input=bad_input):
                actual = language_detector._calculate_sentence_probability(ngram_trie,
                                                                           bad_input)
                self.assertEqual(expected, actual)

    def test_probability_language_detector_several_ngrams_case(self):
        """
        Checks a detector built with several n-gram sizes: for 5-grams of the unknown text,
        the german storage is expected to yield a larger value than the english one
        """
        language_detector = ProbabilityLanguageDetector((3, 5), 1000)

        english_encoded, german_encoded, unknown_encoded = self._load_encoded_corpora()

        language_detector.new_language(english_encoded, 'english')
        language_detector.new_language(german_encoded, 'german')

        # Despite the names, these are the 5-gram storages, not probabilities.
        eng_prob = language_detector.n_gram_storages['english'][5]
        ger_prob = language_detector.n_gram_storages['german'][5]

        ngram_trie = NGramTrie(5)
        ngram_trie.fill_n_grams(unknown_encoded)

        eng = language_detector._calculate_sentence_probability(eng_prob, ngram_trie.n_grams)
        ger = language_detector._calculate_sentence_probability(ger_prob, ngram_trie.n_grams)
        self.assertGreater(ger, eng)

    def test_probability_language_detector_detect_language_ideal(self):
        """
        Checks that detect_language, given 4-grams of the unknown text,
        reports a larger value for 'german' than for 'english'
        """
        eng_encoded, ger_encoded, unk_encoded = self._load_encoded_corpora()

        language_detector = ProbabilityLanguageDetector((3, 4, 5), 1000)
        language_detector.new_language(eng_encoded, 'english')
        language_detector.new_language(ger_encoded, 'german')

        ngram_unknown = NGramTrie(4)
        ngram_unknown.fill_n_grams(unk_encoded)

        actual = language_detector.detect_language(ngram_unknown.n_grams)
        self.assertGreater(actual['german'], actual['english'])

    def test_probability_language_detector_detect_incorrect(self):
        """
        Checks that detect_language returns an empty dict for every malformed input
        """
        language_detector = ProbabilityLanguageDetector((3, 5), 100)
        bad_inputs = [[], {}, '', 1, None, True, (None,)]

        expected = {}
        for bad_input in bad_inputs:
            with self.subTest(bad_input=bad_input):
                actual = language_detector.detect_language(bad_input)
                self.assertEqual(expected, actual)

    # side_effect forwards every recorded call to the real method of a separate
    # detector instance, so detect_language still gets real return values.
    @patch('lab_3.main.ProbabilityLanguageDetector._calculate_sentence_probability',
           side_effect=ProbabilityLanguageDetector()._calculate_sentence_probability)
    def test_probability_language_detector_calls_required_method(self, mock):
        """
        Checks that detect_language calls _calculate_sentence_probability

        :param mock: the mock installed by the patch decorator
        """
        eng_encoded, ger_encoded, unk_encoded = self._load_encoded_corpora()

        language_detector = ProbabilityLanguageDetector((3, 4, 5), 1000)
        language_detector.new_language(eng_encoded, 'english')
        language_detector.new_language(ger_encoded, 'german')

        ngram_unknown = NGramTrie(4)
        ngram_unknown.fill_n_grams(unk_encoded)

        language_detector.detect_language(ngram_unknown.n_grams)
        self.assertTrue(mock.called)
Ejemplo n.º 10
0
    # NOTE(review): unknown_file, german_file and english_file are opened before
    # this fragment begins; how they are opened is not visible here.
    # Tokenize the contents of each file, then close all three handles.
    text_unk = tokenize_by_sentence(unknown_file.read())
    text_ger = tokenize_by_sentence(german_file.read())
    text_eng = tokenize_by_sentence(english_file.read())
    english_file.close()
    german_file.close()
    unknown_file.close()

    # A single LetterStorage is updated with all three texts (english, german, unknown).
    letter_storage = LetterStorage()
    letter_storage.update(text_eng)
    letter_storage.update(text_ger)
    letter_storage.update(text_unk)

    # Each text is encoded against that shared storage.
    eng_encoded = encode_corpus(letter_storage, text_eng)
    unk_encoded = encode_corpus(letter_storage, text_unk)
    ger_encoded = encode_corpus(letter_storage, text_ger)

    # Only english and german are registered with the detector.
    language_detector = ProbabilityLanguageDetector((3, 4, 5), 1000)
    language_detector.new_language(eng_encoded, 'english')
    language_detector.new_language(ger_encoded, 'german')

    # The unknown text is passed to the detector as the n_grams of a size-4 NGramTrie.
    ngram_unknown = NGramTrie(4)
    ngram_unknown.fill_n_grams(unk_encoded)

    actual = language_detector.detect_language(ngram_unknown.n_grams)
    print(actual)

    # RESULT is True when the value reported for 'german' exceeds the one for 'english'.
    RESULT = actual['english'] < actual['german']
    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT == 1, ''