def test_letter_storage_update_calls_required_function(self, mock):
    """
    update is expected to invoke the patched callable

    :param mock: mock object supplied by the caller; the patch decorator
        is not part of this snippet
    """
    storage = LetterStorage()
    corpus = ((('_', 't', 'e', 's', 't', '_'), ), )

    storage.update(corpus)

    self.assertTrue(mock.called)
def test_letter_storage_get_id_by_letter_ideal(self):
    """
    get_id_by_letter returns the id kept in storage for a known letter
    """
    storage = LetterStorage()
    storage.storage = {'w': 1}

    found_id = storage.get_id_by_letter('w')

    self.assertEqual(1, found_id)
def test_letter_storage_get_id_by_letter_none(self):
    """
    get_id_by_letter called with None is expected to return -1
    """
    storage = LetterStorage()
    storage.storage = {'w': 1}

    found_id = storage.get_id_by_letter(None)

    self.assertEqual(-1, found_id)
def test_letter_storage_get_id_by_letter_not_str(self):
    """
    get_id_by_letter called with a non-string letter is expected to return -1
    """
    storage = LetterStorage()
    storage.storage = {'w': 1}

    found_id = storage.get_id_by_letter(123)

    self.assertEqual(-1, found_id)
def test_letter_storage_update_none(self):
    """
    update called with None: expected to return 1 and leave storage empty
    """
    storage = LetterStorage()

    result = storage.update(None)

    self.assertEqual(storage.storage, {})
    self.assertEqual(1, result)
def test_letter_storage_update_ideal(self):
    """
    update with a valid one-sentence corpus: expected to return 0 and
    store the four distinct symbols ('_', 't', 'e', 's')
    """
    storage = LetterStorage()
    corpus = ((('_', 't', 'e', 's', 't', '_'), ), )

    result = storage.update(corpus)

    self.assertEqual(len(storage.storage), 4)
    self.assertEqual(0, result)
def test_letter_storage_put_letter_ideal(self):
    """
    _put_letter with a new string letter: expected to return 0 and
    register the letter in storage
    """
    letter_storage = LetterStorage()
    letter = 'w'
    expected = 0

    actual = letter_storage._put_letter(letter)

    # assertIn reports the container contents on failure,
    # unlike assertTrue(letter in ...)
    self.assertIn(letter, letter_storage.storage)
    self.assertEqual(expected, actual)
def test_letter_storage_put_letter_not_str(self):
    """
    _put_letter with a non-string value: expected to return 1 and
    leave storage empty
    """
    storage = LetterStorage()

    result = storage._put_letter(123)

    self.assertEqual(storage.storage, {})
    self.assertEqual(1, result)
def test_letter_storage_put_letter_existing(self):
    """
    _put_letter with a letter already in storage: expected to return 0
    and keep the existing id unchanged
    """
    storage = LetterStorage()
    storage.storage = {'w': 1}

    result = storage._put_letter('w')

    self.assertEqual(storage.storage, {'w': 1})
    self.assertEqual(0, result)
def test_letter_storage_update_not_tuple(self):
    """
    update called with a list instead of a tuple: expected to return 1
    and leave storage empty
    """
    storage = LetterStorage()
    corpus_as_list = [
        (('_', 't', 'e', 's', 't', '_'), ),
        (('_', 's', 'e', 'c', 'o', 'n', 'd', '_'), ),
    ]

    result = storage.update(corpus_as_list)

    self.assertEqual(storage.storage, {})
    self.assertEqual(1, result)
def test_detect_language_calls_required_method(self, mock):
    """
    detect_language on a trained detector is expected to invoke the
    patched callable

    The corpus is read inside a ``with`` block so the file handle is
    closed even if reading or tokenization raises.

    :param mock: mock object supplied by the caller; the patch decorator
        is not part of this snippet
    """
    letter_storage = LetterStorage()
    language_detector = LanguageDetector((3, ), 10)
    text_to_detect = (((1, 2, 3), ), )

    with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as corpus_file:
        text = tokenize_by_sentence(corpus_file.read())

    letter_storage.update(text)
    encoded_text = encode_corpus(letter_storage, text)

    language_detector.new_language(encoded_text, 'english')
    language_detector.detect_language(text_to_detect)

    self.assertTrue(mock.called)
def test_new_language_creates_several_ngrams(self):
    """
    new_language on a detector configured with sizes (2, 3) is expected
    to leave a truthy entry for each size under the language name

    The corpus is read inside a ``with`` block so the file handle is
    closed even if reading or tokenization raises.
    """
    letter_storage = LetterStorage()
    language_detector = LanguageDetector((2, 3), 10)

    with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as corpus_file:
        text = tokenize_by_sentence(corpus_file.read())

    letter_storage.update(text)
    encoded_text = encode_corpus(letter_storage, text)

    language_detector.new_language(encoded_text, 'english')

    self.assertTrue(language_detector.n_gram_storages['english'][2])
    self.assertTrue(language_detector.n_gram_storages['english'][3])
def test_new_language_add_existing_language(self):
    """
    registering the same language twice: the second new_language call
    is expected to return 0

    The corpus is read inside a ``with`` block so the file handle is
    closed even if reading or tokenization raises.
    """
    letter_storage = LetterStorage()
    language_detector = LanguageDetector((3, ), 10)

    with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as corpus_file:
        text = tokenize_by_sentence(corpus_file.read())

    letter_storage.update(text)
    encoded_text = encode_corpus(letter_storage, text)

    expected = 0
    language_detector.new_language(encoded_text, 'german')
    actual = language_detector.new_language(encoded_text, 'german')

    self.assertEqual(expected, actual)
def test_detect_language_uses_several_ngrams(self):
    """
    detector configured with sizes (2, 3) scores an unknown text against
    two registered languages

    Each corpus is read inside its own ``with`` block so the file handle
    is closed even if reading or tokenization raises.
    """
    letter_storage = LetterStorage()
    language_detector = LanguageDetector((2, 3), 100)

    with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file_first:
        text_english = tokenize_by_sentence(file_first.read())
    with open('lab_3/Thomas_Mann.txt', 'r', encoding='utf-8') as file_second:
        text_german = tokenize_by_sentence(file_second.read())
    with open('lab_3/unknown_Arthur_Conan_Doyle.txt', 'r', encoding='utf-8') as file_third:
        text_unknown = tokenize_by_sentence(file_third.read())

    # one shared storage, so ids are consistent across all three texts
    letter_storage.update(text_english)
    letter_storage.update(text_german)
    letter_storage.update(text_unknown)

    encoded_english = encode_corpus(letter_storage, text_english)
    encoded_german = encode_corpus(letter_storage, text_german)
    encoded_unknown = encode_corpus(letter_storage, text_unknown)

    language_detector.new_language(encoded_english, 'english')
    language_detector.new_language(encoded_german, 'german')

    actual = language_detector.detect_language(encoded_unknown)

    # NOTE(review): whether a larger score means "closer" or "farther" is
    # decided by detect_language, which is not visible here - confirm
    self.assertGreater(actual['german'], actual['english'])
def test_letter_storage_correct_instance_creation(self):
    """
    a freshly created LetterStorage exposes an empty dict as storage
    """
    fresh_storage = LetterStorage()

    self.assertEqual(fresh_storage.storage, {})
def test_encode_corpus_empty_sentence(self):
    """
    Tests that encode_corpus returns an empty tuple for an empty corpus
    """
    storage = LetterStorage()

    encoded = encode_corpus(storage, ())

    self.assertEqual((), encoded)
def test_encode_corpus_calls_require_function(self, mock):
    """
    Tests that encode_corpus invokes the patched callable

    :param mock: mock object supplied by the caller; the patch decorator
        is not part of this snippet
    """
    storage = LetterStorage()
    corpus = (
        (('_', 't', 'e', 's', 't', '_'), ),
        (('_', 's', 'e', 'c', 'o', 'n', 'd', '_'), ),
    )

    encode_corpus(storage, corpus)

    self.assertTrue(mock.called)
def test_encode_corpus_same_characters_count(self):
    """
    Tests that encode_corpus gives the same id to the same character
    when it occurs in two different sentences
    """
    storage = LetterStorage()
    repeated_sentence = (('_', 't', 'e', 's', 't', '_'), )
    corpus = (repeated_sentence, repeated_sentence)

    encoded = encode_corpus(storage, corpus)

    # compare the first symbol of the first word in each sentence
    first_in_first = encoded[0][0][0]
    first_in_second = encoded[1][0][0]
    self.assertEqual(first_in_first, first_in_second)
def test_encode_corpus_inappropriate_sentence(self):
    """
    Tests that encode_corpus returns an empty tuple for every
    inappropriate corpus value

    Each input runs in its own subTest so a failure names the offending
    value and the remaining inputs are still checked.
    """
    letter_storage = LetterStorage()
    bad_inputs = [None, 123, 'test', [], {}]
    expected = ()

    for bad_input in bad_inputs:
        with self.subTest(bad_input=bad_input):
            actual = encode_corpus(letter_storage, bad_input)
            self.assertEqual(expected, actual)
def test_new_language_storage_already_created(self):
    """
    registering two languages: each is expected to get a truthy storage
    entry whose size-3 element is an NGramTrie

    Each corpus is read inside its own ``with`` block so the file handle
    is closed even if reading or tokenization raises.
    """
    letter_storage = LetterStorage()
    language_detector = LanguageDetector((3, ), 10)

    with open('lab_3/Thomas_Mann.txt', 'r', encoding='utf-8') as corpus_file:
        text = tokenize_by_sentence(corpus_file.read())
    with open('lab_3/unknown_Arthur_Conan_Doyle.txt', 'r', encoding='utf-8') as file_unknown:
        text_unknown = tokenize_by_sentence(file_unknown.read())

    letter_storage.update(text)
    letter_storage.update(text_unknown)

    encoded_text = encode_corpus(letter_storage, text)
    encoded_unknown_text = encode_corpus(letter_storage, text_unknown)

    language_detector.new_language(encoded_text, 'german')
    language_detector.new_language(encoded_unknown_text, 'english')

    self.assertTrue(language_detector.n_gram_storages['german'])
    self.assertTrue(language_detector.n_gram_storages['english'])

    # exact type comparison is kept on purpose: a subclass would not pass
    self.assertEqual(type(language_detector.n_gram_storages['german'][3]), NGramTrie)
    self.assertEqual(type(language_detector.n_gram_storages['english'][3]), NGramTrie)
def test_probability_language_detector_detect_language_ideal(self):
    """
    ProbabilityLanguageDetector with sizes (3, 4, 5) scores an unknown
    text against two registered languages

    Each corpus is read inside its own ``with`` block so the file handle
    is closed even if reading or tokenization raises.
    """
    with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
        text_unk = tokenize_by_sentence(unknown_file.read())
    with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
        text_ger = tokenize_by_sentence(german_file.read())
    with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
        text_eng = tokenize_by_sentence(english_file.read())

    # one shared storage, so ids are consistent across all three texts
    letter_storage = LetterStorage()
    letter_storage.update(text_eng)
    letter_storage.update(text_ger)
    letter_storage.update(text_unk)

    eng_encoded = encode_corpus(letter_storage, text_eng)
    unk_encoded = encode_corpus(letter_storage, text_unk)
    ger_encoded = encode_corpus(letter_storage, text_ger)

    language_detector = ProbabilityLanguageDetector((3, 4, 5), 1000)
    language_detector.new_language(eng_encoded, 'english')
    language_detector.new_language(ger_encoded, 'german')

    actual = language_detector.detect_language(unk_encoded)

    # NOTE(review): the meaning of a larger score is defined by
    # detect_language, which is not visible here - confirm
    self.assertGreater(actual['german'], actual['english'])
def test_probability_language_detector_calls_required_method(self, mock):
    """
    detect_language of ProbabilityLanguageDetector is expected to invoke
    the patched callable

    Each corpus is read inside its own ``with`` block so the file handle
    is closed even if reading or tokenization raises.

    :param mock: mock object supplied by the caller; the patch decorator
        is not part of this snippet
    """
    with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
        text_unk = tokenize_by_sentence(unknown_file.read())
    with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
        text_ger = tokenize_by_sentence(german_file.read())
    with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
        text_eng = tokenize_by_sentence(english_file.read())

    letter_storage = LetterStorage()
    letter_storage.update(text_eng)
    letter_storage.update(text_ger)
    letter_storage.update(text_unk)

    eng_encoded = encode_corpus(letter_storage, text_eng)
    unk_encoded = encode_corpus(letter_storage, text_unk)
    ger_encoded = encode_corpus(letter_storage, text_ger)

    language_detector = ProbabilityLanguageDetector((3, 4, 5), 1000)
    language_detector.new_language(eng_encoded, 'english')
    language_detector.new_language(ger_encoded, 'german')

    # here the detector receives the 4-grams of the unknown text,
    # not the encoded corpus itself
    ngram_unknown = NGramTrie(4)
    ngram_unknown.fill_n_grams(unk_encoded)

    language_detector.detect_language(ngram_unknown.n_grams)

    self.assertTrue(mock.called)
def test_encode_corpus_ideal(self):
    """
    Tests that every character produced by encode_corpus is an int id
    """
    letter_storage = LetterStorage()
    sentences = (
        (('_', 't', 'e', 's', 't', '_'), ),
        (('_', 's', 'e', 'c', 'o', 'n', 'd', '_'), ),
    )

    actual = encode_corpus(letter_storage, sentences)

    # NOTE(review): an empty result passes vacuously; no size check here
    for text in actual:
        for sentence in text:
            for character in sentence:
                # assertIsInstance reports the offending value and type
                self.assertIsInstance(character, int)
def test_probability_language_detector_calculate_probability_ideal(self):
    """
    _calculate_sentence_probability over the 3-grams of an unknown text
    is compared between the english and the german tries

    Each corpus is read inside its own ``with`` block so the file handle
    is closed even if reading or tokenization raises.
    """
    print('launching test')

    with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
        english_text = tokenize_by_sentence(english_file.read())
    with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
        german_text = tokenize_by_sentence(german_file.read())
    with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
        unknown_text = tokenize_by_sentence(unknown_file.read())

    # one shared storage, so ids are consistent across all three texts
    letter_storage = LetterStorage()
    letter_storage.update(english_text)
    letter_storage.update(german_text)
    letter_storage.update(unknown_text)

    english_encoded = encode_corpus(letter_storage, english_text)
    german_encoded = encode_corpus(letter_storage, german_text)
    unknown_encoded = encode_corpus(letter_storage, unknown_text)

    language_detector = ProbabilityLanguageDetector((3,), 1000)
    language_detector.new_language(english_encoded, 'english')
    language_detector.new_language(german_encoded, 'german')

    n3_gram_trie_english = language_detector.n_gram_storages['english'][3]
    n3_gram_trie_german = language_detector.n_gram_storages['german'][3]

    n3_gram_unknown = NGramTrie(3)
    n3_gram_unknown.fill_n_grams(unknown_encoded)

    english_prob = language_detector._calculate_sentence_probability(
        n3_gram_trie_english, n3_gram_unknown.n_grams)
    german_prob = language_detector._calculate_sentence_probability(
        n3_gram_trie_german, n3_gram_unknown.n_grams)

    print(f'English_sentence_prob: {english_prob}')
    print(f'Deutsch_sentence_prob: {german_prob}')

    self.assertGreater(english_prob, german_prob)
def test_probability_language_detector_several_ngrams_case(self):
    """
    detector configured with sizes (3, 5): _calculate_sentence_probability
    over the 5-grams of an unknown text is compared between both languages

    Each corpus is read inside its own ``with`` block so the file handle
    is closed even if reading or tokenization raises.
    """
    language_detector = ProbabilityLanguageDetector((3, 5), 1000)

    with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
        eng_text = tokenize_by_sentence(english_file.read())
    with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
        ger_text = tokenize_by_sentence(german_file.read())
    with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
        unk_text = tokenize_by_sentence(unknown_file.read())

    # one shared storage, so ids are consistent across all three texts
    letter_storage = LetterStorage()
    letter_storage.update(eng_text)
    letter_storage.update(ger_text)
    letter_storage.update(unk_text)

    english_encoded = encode_corpus(letter_storage, eng_text)
    german_encoded = encode_corpus(letter_storage, ger_text)
    unknown_encoded = encode_corpus(letter_storage, unk_text)

    language_detector.new_language(english_encoded, 'english')
    language_detector.new_language(german_encoded, 'german')

    # these are the stored size-5 entries per language, not probabilities
    eng_trie = language_detector.n_gram_storages['english'][5]
    ger_trie = language_detector.n_gram_storages['german'][5]

    ngram_trie = NGramTrie(5)
    ngram_trie.fill_n_grams(unknown_encoded)

    eng = language_detector._calculate_sentence_probability(
        eng_trie, ngram_trie.n_grams)
    ger = language_detector._calculate_sentence_probability(
        ger_trie, ngram_trie.n_grams)

    # NOTE(review): the 3-gram test in this suite asserts the opposite
    # ordering (english > german) - confirm this direction is intended
    self.assertGreater(ger, eng)
""" Language detector implementation starter """ from lab_3.main import tokenize_by_sentence, LanguageDetector from lab_3.main import encode_corpus from lab_3.main import LetterStorage if __name__ == '__main__': # here goes your function calls letter_storage = LetterStorage() language_detector = LanguageDetector((3, ), 100) file_first = open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') file_second = open('lab_3/Thomas_Mann.txt', 'r', encoding='utf-8') file_third = open('lab_3/unknown_Arthur_Conan_Doyle.txt', 'r', encoding='utf-8') text_english = tokenize_by_sentence(file_first.read()) text_german = tokenize_by_sentence(file_second.read()) text_unknown = tokenize_by_sentence(file_third.read()) letter_storage.update(text_english) letter_storage.update(text_german) letter_storage.update(text_unknown) encoded_english = encode_corpus(letter_storage, text_english) encoded_german = encode_corpus(letter_storage, text_german) encoded_unknown = encode_corpus(letter_storage, text_unknown) file_first.close() file_second.close()
class LetterStorageTest(unittest.TestCase):
    """
    check LetterStorage class functionality.
        All tests should pass for score 4 or above
    """

    def test_letter_storage_correct_instance_creation(self):
        """
        letter storage instance creates with correct attributes
        """
        letter_storage = LetterStorage()
        expected = {}
        self.assertEqual(letter_storage.storage, expected)

    # --------------------------------------------------------

    def test_letter_storage_put_letter_ideal(self):
        """
        letter is added to storage
        """
        letter_storage = LetterStorage()
        letter = 'w'
        expected = 0
        actual = letter_storage._put_letter(letter)
        # assertIn reports the container contents on failure
        self.assertIn(letter, letter_storage.storage)
        self.assertEqual(expected, actual)

    def test_letter_storage_put_letter_none(self):
        """
        none is not added to storage
        """
        letter_storage = LetterStorage()
        letter = None
        expected = 1
        actual = letter_storage._put_letter(letter)
        self.assertEqual(letter_storage.storage, {})
        self.assertEqual(expected, actual)

    def test_letter_storage_put_letter_not_str(self):
        """
        non string letter is not added to storage
        """
        letter_storage = LetterStorage()
        letter = 123
        expected = 1
        actual = letter_storage._put_letter(letter)
        self.assertEqual(letter_storage.storage, {})
        self.assertEqual(expected, actual)

    def test_letter_storage_put_letter_existing(self):
        """
        existing letter is not added to storage
        """
        letter_storage = LetterStorage()
        letter = 'w'
        letter_storage.storage = {'w': 1}
        expected = 0
        actual = letter_storage._put_letter(letter)
        self.assertEqual(letter_storage.storage, {'w': 1})
        self.assertEqual(expected, actual)

    # -----------------------------------------------------------------

    def test_letter_storage_get_id_by_letter_ideal(self):
        """
        ideal case for get_id_by_letter
        """
        letter_storage = LetterStorage()
        letter_storage.storage = {'w': 1}
        expected = 1
        actual = letter_storage.get_id_by_letter('w')
        self.assertEqual(expected, actual)

    def test_letter_storage_get_id_by_letter_none(self):
        """
        get_id_by_letter with None returns -1
        """
        letter_storage = LetterStorage()
        letter_storage.storage = {'w': 1}
        expected = -1
        actual = letter_storage.get_id_by_letter(None)
        self.assertEqual(expected, actual)

    def test_letter_storage_get_id_by_letter_not_str(self):
        """
        get_id_by_letter with a non-string letter returns -1
        """
        letter_storage = LetterStorage()
        letter_storage.storage = {'w': 1}
        expected = -1
        actual = letter_storage.get_id_by_letter(123)
        self.assertEqual(expected, actual)

    def test_letter_storage_get_id_by_letter_not_in_storage(self):
        """
        get_id_by_letter with a letter missing from storage returns -1
        """
        letter_storage = LetterStorage()
        letter_storage.storage = {'w': 1}
        expected = -1
        actual = letter_storage.get_id_by_letter('a')
        self.assertEqual(expected, actual)

    # -----------------------------------------------------------

    def test_letter_storage_update_ideal(self):
        """
        ideal case for update
        """
        letter_storage = LetterStorage()
        sentences = ((('_', 't', 'e', 's', 't', '_'), ), )
        expected = 0
        actual = letter_storage.update(sentences)
        self.assertEqual(len(letter_storage.storage), 4)
        self.assertEqual(expected, actual)

    def test_letter_storage_update_duplicates(self):
        """
        update with a repeated sentence stores each letter only once
        """
        letter_storage = LetterStorage()
        sentences = ((('_', 't', 'e', 's', 't', '_'), ),
                     (('_', 't', 'e', 's', 't', '_'), ))
        expected = 0
        actual = letter_storage.update(sentences)
        self.assertEqual(len(letter_storage.storage), 4)
        self.assertEqual(expected, actual)

    def test_letter_storage_update_empty(self):
        """
        update with an empty corpus returns 0 and stores nothing
        """
        letter_storage = LetterStorage()
        sentences = ()
        expected = 0
        actual = letter_storage.update(sentences)
        self.assertEqual(letter_storage.storage, {})
        self.assertEqual(expected, actual)

    def test_letter_storage_update_none(self):
        """
        update with None returns 1 and stores nothing
        """
        letter_storage = LetterStorage()
        sentences = None
        expected = 1
        actual = letter_storage.update(sentences)
        self.assertEqual(letter_storage.storage, {})
        self.assertEqual(expected, actual)

    def test_letter_storage_update_not_tuple(self):
        """
        update with a list instead of a tuple returns 1 and stores nothing
        """
        letter_storage = LetterStorage()
        sentences = [(('_', 't', 'e', 's', 't', '_'), ),
                     (('_', 's', 'e', 'c', 'o', 'n', 'd', '_'), )]
        expected = 1
        actual = letter_storage.update(sentences)
        self.assertEqual(letter_storage.storage, {})
        self.assertEqual(expected, actual)

    # NOTE(review): side_effect is bound to a separate LetterStorage()
    # created when the decorator is evaluated, not to the instance under
    # test, so this test can only check that the call happened.
    @patch('lab_3.main.LetterStorage._put_letter',
           side_effect=LetterStorage()._put_letter)
    def test_letter_storage_update_calls_required_function(self, mock):
        """
        update is expected to call the _put_letter method
        """
        letter_storage = LetterStorage()
        sentences = ((('_', 't', 'e', 's', 't', '_'), ), )
        letter_storage.update(sentences)
        self.assertTrue(mock.called)
from lab_3.main import LetterStorage
from lab_3.main import LanguageDetector
# The script below uses these names, but they were not imported here.
from lab_3.main import tokenize_by_sentence
from lab_3.main import encode_corpus
# NOTE(review): NGramTrie is assumed to be defined in lab_3.main - confirm
from lab_3.main import NGramTrie

if __name__ == '__main__':

    # Each corpus is read inside its own ``with`` block so the file
    # handle is closed even if reading or tokenization raises.
    with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
        text_unk = tokenize_by_sentence(unknown_file.read())
    with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
        text_ger = tokenize_by_sentence(german_file.read())
    with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
        text_eng = tokenize_by_sentence(english_file.read())

    # one shared storage, so ids are consistent across all three texts
    letter_storage = LetterStorage()
    letter_storage.update(text_eng)
    letter_storage.update(text_ger)
    letter_storage.update(text_unk)

    eng_encoded = encode_corpus(letter_storage, text_eng)
    unk_encoded = encode_corpus(letter_storage, text_unk)
    ger_encoded = encode_corpus(letter_storage, text_ger)

    language_detector = LanguageDetector((3, 4, 5), 1000)
    language_detector.new_language(eng_encoded, 'english')
    language_detector.new_language(ger_encoded, 'german')

    # build the 4-grams of the unknown text
    ngram_unknown = NGramTrie(4)
    ngram_unknown.fill_n_grams(unk_encoded)
class EncodeCorpusTest(unittest.TestCase):
    """
    checks for encode_corpus function.
        Score 6 or above function
    """

    def test_encode_corpus_ideal(self):
        """
        Tests that encode_corpus function
            generates id for each character
        """
        letter_storage = LetterStorage()
        sentences = (
            (('_', 't', 'e', 's', 't', '_'),),
            (('_', 's', 'e', 'c', 'o', 'n', 'd', '_'),)
        )
        actual = encode_corpus(letter_storage, sentences)
        # NOTE(review): an empty result passes vacuously; no size check here
        for text in actual:
            for sentence in text:
                for character in sentence:
                    # assertIsInstance reports the offending value and type
                    self.assertIsInstance(character, int)

    def test_encode_corpus_same_characters_count(self):
        """
        Tests that encode_corpus function
            can assign correct id to the same character
        """
        letter_storage = LetterStorage()
        sentences = (
            (('_', 't', 'e', 's', 't', '_'),),
            (('_', 't', 'e', 's', 't', '_'),)
        )
        actual = encode_corpus(letter_storage, sentences)
        # first symbol of the first word in each of the two sentences
        self.assertEqual(actual[0][0][0], actual[1][0][0])

    def test_encode_corpus_inappropriate_sentence(self):
        """
        Tests that encode_corpus function
            can handle inappropriate sentence inputs
        """
        letter_storage = LetterStorage()
        bad_inputs = [None, 123, 'test', [], {}]
        expected = ()
        for bad_input in bad_inputs:
            # subTest names the offending input and keeps checking the rest
            with self.subTest(bad_input=bad_input):
                actual = encode_corpus(letter_storage, bad_input)
                self.assertEqual(expected, actual)

    def test_encode_corpus_inappropriate_storage_instance(self):
        """
        Tests that encode_corpus function
            can handle inappropriate storage instance inputs
        """
        bad_inputs = [None, 123, 'test', [], {}]
        sentences = (
            (('_', 't', 'e', 's', 't', '_'),),
            (('_', 's', 'e', 'c', 'o', 'n', 'd', '_'),)
        )
        expected = ()
        for bad_input in bad_inputs:
            # subTest names the offending input and keeps checking the rest
            with self.subTest(bad_input=bad_input):
                actual = encode_corpus(bad_input, sentences)
                self.assertEqual(expected, actual)

    def test_encode_corpus_empty_sentence(self):
        """
        Tests that encode_corpus function
            can handle empty sentence input
        """
        letter_storage = LetterStorage()
        sentences = ()
        expected = ()
        actual = encode_corpus(letter_storage, sentences)
        self.assertEqual(expected, actual)

    # NOTE(review): side_effect is bound to a separate LetterStorage()
    # created when the decorator is evaluated, not to the instance under
    # test, so this test can only check that the call happened.
    @patch('lab_3.main.LetterStorage.get_id_by_letter',
           side_effect=LetterStorage().get_id_by_letter)
    def test_encode_corpus_calls_require_function(self, mock):
        """
        Tests that encode_corpus function
            calls required get_id_by_letter function
        """
        letter_storage = LetterStorage()
        sentences = (
            (('_', 't', 'e', 's', 't', '_'),),
            (('_', 's', 'e', 'c', 'o', 'n', 'd', '_'),)
        )
        encode_corpus(letter_storage, sentences)
        self.assertTrue(mock.called)