def test_get_char_frequencies_simple_text(self):
    # Every character of the QUOTE_FROM_TOLSTOY sample (letters, spaces and
    # punctuation, case-sensitive) must be reported with its exact count.
    expected_counts = {
        u'Н': 1, u'е': 3, u' ': 11, u'с': 2, u'л': 1, u'у': 3, u'ш': 2,
        u'а': 2, u'й': 1, u'т': 4, u'х': 3, u',': 1, u'к': 1, u'о': 9,
        u'г': 2, u'в': 2, u'р': 4, u'и': 3, u'д': 2, u'н': 1, u'.': 1,
    }
    actual_counts = textstatistics.get_char_frequencies(QUOTE_FROM_TOLSTOY)
    self.assertDictEqual(actual_counts, expected_counts)
def test_get_char_frequencies_simple_text(self):
    # A short Russian sentence: counts are case-sensitive ('Н' and 'н' are
    # separate keys) and include the spaces and punctuation marks.
    sentence = u'Не слушайте тех, кто говорит дурно о других и хорошо о вас.'
    expected_counts = {
        u'Н': 1, u'е': 3, u' ': 11,
        u'с': 2, u'л': 1, u'у': 3,
        u'ш': 2, u'а': 2, u'й': 1,
        u'т': 4, u'х': 3, u',': 1,
        u'к': 1, u'о': 9, u'г': 2,
        u'в': 2, u'р': 4, u'и': 3,
        u'д': 2, u'н': 1, u'.': 1,
    }
    self.assertDictEqual(
        textstatistics.get_char_frequencies(sentence),
        expected_counts,
    )
def test_evalutate_decoding_complete_eng(self):
    # The decoded text is exactly the text the language statistics were
    # built from, so the evaluation is expected to be 1.0.
    base_text = 'This is a sample text.'
    decoded_text = base_text
    language = textstatistics.Languauge(
        textstatistics.get_char_frequencies(base_text),
        textstatistics.get_word_frequencies(base_text),
    )
    score = decode.evalutate_decoding(decoded_text, language)
    self.assertEqual(score, 1.0)
def test_evalutate_decoding_almost_eng(self):
    # A decoding with a few wrong letters must score strictly between 0.5
    # and 1.0 against statistics built from the correct text.
    reference_text = 'This is a sample text.'
    imperfect_text = 'Thas as i simple text.'
    language = textstatistics.Languauge(
        textstatistics.get_char_frequencies(reference_text),
        textstatistics.get_word_frequencies(reference_text),
    )
    score = decode.evalutate_decoding(imperfect_text, language)
    self.assertTrue(0.5 < score < 1.0)
def test_evalutate_decoding_subset_eng(self):
    # A decoding made of only the first half of the reference words is
    # still expected to score 1.0.
    reference_text = 'This is a sample text.'
    words = textstatistics.split_to_words(reference_text)
    half = len(words) // 2
    partial_text = ' '.join(words[:half])
    language = textstatistics.Languauge(
        textstatistics.get_char_frequencies(reference_text),
        textstatistics.get_word_frequencies(reference_text),
    )
    score = decode.evalutate_decoding(partial_text, language)
    self.assertEqual(score, 1.0)
def test_evalutate_decoding_subset_eng(self):
    # A decoding made of only the first half of the reference words is
    # still expected to score 1.0.
    base_text = 'This is a sample text.'
    base_text_words = textstatistics.split_to_words(base_text)
    # Floor division: on Python 3 `/` yields a float, and a float slice
    # bound raises TypeError. `//` gives the same integer on Python 2 and 3.
    half = len(base_text_words) // 2
    decoded_text = ' '.join(base_text_words[:half])
    alphabet = textstatistics.get_char_frequencies(base_text)
    dictionary = textstatistics.get_word_frequencies(base_text)
    language = textstatistics.Languauge(alphabet, dictionary)
    result = decode.evalutate_decoding(decoded_text, language)
    self.assertEqual(result, 1.0)
def test_my_get_char(self):
    # ASCII sample with repeated letters, one space and repeated
    # punctuation; every distinct character gets its own count.
    sample = u'hello world!!!'
    expected_counts = {
        u'h': 1, u'e': 1, u'l': 3,
        u'o': 2, u' ': 1, u'w': 1,
        u'r': 1, u'd': 1, u'!': 3,
    }
    self.assertDictEqual(
        textstatistics.get_char_frequencies(sample),
        expected_counts,
    )
def setUp(self):
    '''Build the fixture: source text, language statistics and a cipher.

    Sets ``self.original_text``, ``self.language`` (statistics restricted
    to lowercase Russian letters) and ``self.code`` (a substitution table
    mapping each letter present in the text to a shuffled counterpart).
    '''
    original_alphabet = list(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
    self.original_text = data.QUOTE_FROM_ILF_AND_PETROV
    alphabet = textstatistics.get_char_frequencies(self.original_text)
    # `.items()` instead of `.iteritems()`: the latter was removed in
    # Python 3, while `.items()` works on both Python 2 and Python 3.
    alphabet = {
        char: frequency
        for (char, frequency) in alphabet.items()
        if char in original_alphabet
    }
    dictionary = textstatistics.get_word_frequencies(self.original_text)
    self.language = textstatistics.Languauge(alphabet, dictionary)
    actual_original_alphabet = alphabet.keys()
    shuffled_alphabet = list(actual_original_alphabet)
    # Fixed seed so the generated cipher is the same on every run.
    random.seed(1001)
    random.shuffle(shuffled_alphabet)
    self.code = dict(zip(actual_original_alphabet, shuffled_alphabet))
def setUp(self):
    '''Build the fixture: source text, language statistics and a cipher.

    Sets ``self.original_text``, ``self.language`` (statistics restricted
    to lowercase Russian letters) and ``self.code`` (a substitution table
    mapping each letter present in the text to a shuffled counterpart).
    '''
    russian_letters = list(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
    self.original_text = data.QUOTE_FROM_ILF_AND_PETROV

    # Keep only the counts of lowercase Russian letters.
    char_counts = textstatistics.get_char_frequencies(self.original_text)
    alphabet = {}
    for char, frequency in char_counts.items():
        if char in russian_letters:
            alphabet[char] = frequency

    word_counts = textstatistics.get_word_frequencies(self.original_text)
    self.language = textstatistics.Languauge(alphabet, word_counts)

    # Pair every letter that occurs in the text with a letter drawn from a
    # shuffled copy; the fixed seed keeps the cipher reproducible.
    plain_letters = alphabet.keys()
    cipher_letters = list(plain_letters)
    random.seed(1001)
    random.shuffle(cipher_letters)
    self.code = dict(zip(plain_letters, cipher_letters))
def test_get_char_frequencies_uniform(self):
    # Each character of the sample (the Russian lowercase alphabet plus a
    # space) occurs exactly once, so every count must be 1.
    sample = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя '
    expected_counts = dict.fromkeys(sample, 1)
    actual_counts = textstatistics.get_char_frequencies(sample)
    self.assertDictEqual(actual_counts, expected_counts)
# Lowercase Russian letters: the only characters the cipher substitutes.
_CIPHER_LETTERS = frozenset(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

# Largest distance between the two ranks swapped during refinement.
_MAX_SWAP_DISTANCE = 32


def _substitute(text, plain_ranked, cipher_ranked):
    '''Replace each cipher letter with the plain letter of the same rank.'''
    mapping = dict(zip(cipher_ranked, plain_ranked))
    # Characters outside the cipher alphabet are copied through untouched.
    # A cipher letter with no partner (possible when the text uses more
    # letters than the language alphabet holds) is also left as-is instead
    # of raising KeyError.
    return ''.join(
        mapping.get(char, char) if char in _CIPHER_LETTERS else char
        for char in text
    )


def decode_text(text, language):
    '''Decode text encoded with a substitution cipher.

    The first guess pairs letters by frequency rank: the rarest letter of
    the text with the rarest letter of the language, and so on. The guess
    is then refined by swapping pairs of language letters and keeping a
    swap only when ``evalutate_decoding`` scores the result strictly higher.

    Args:
        text: The encoded text. Only lowercase Russian letters are
            substituted; every other character is copied unchanged.
        language: Object whose ``get_alphabet()`` returns a mapping of
            character to frequency; it is also handed to
            ``evalutate_decoding``.

    Returns:
        The best decoding found. The search stops as soon as the score
        reaches 1.0, or when a complete sweep of swaps brings no
        improvement.
    '''
    plain_freq = dict(language.get_alphabet().items())
    cipher_freq = {
        char: count
        for char, count in textstatistics.get_char_frequencies(text).items()
        if char in _CIPHER_LETTERS
    }
    # Language letters absent from the text still take part in the ranking,
    # with a frequency of zero.
    for char in plain_freq:
        cipher_freq.setdefault(char, 0)

    # Rank both alphabets from the rarest to the most frequent letter.
    plain_ranked = sorted(plain_freq, key=plain_freq.get)
    cipher_ranked = sorted(cipher_freq, key=cipher_freq.get)

    best_text = _substitute(text, plain_ranked, cipher_ranked)
    best_fitness = evalutate_decoding(best_text, language)

    while best_fitness != 1.0:
        improved = False
        for distance in range(1, _MAX_SWAP_DISTANCE + 1):
            # Bounded by the list that is actually indexed, so the upper
            # index can never run past its end.
            for low in range(len(plain_ranked) - distance):
                high = low + distance
                plain_ranked[low], plain_ranked[high] = (
                    plain_ranked[high], plain_ranked[low])
                candidate = _substitute(text, plain_ranked, cipher_ranked)
                fitness = evalutate_decoding(candidate, language)
                if fitness > best_fitness:
                    best_text, best_fitness = candidate, fitness
                    improved = True
                    if best_fitness == 1.0:
                        return best_text
                else:
                    # Rejected: undo the swap.
                    plain_ranked[low], plain_ranked[high] = (
                        plain_ranked[high], plain_ranked[low])
        if not improved:
            # A sweep that changed nothing would repeat identically forever
            # (assuming evalutate_decoding is deterministic), so stop here.
            break
    return best_text
def test_get_char_frequencies_empty(self):
    # An empty string contains no characters, so the result is an empty dict.
    actual_counts = textstatistics.get_char_frequencies(u'')
    self.assertDictEqual(actual_counts, {})