def test_evalutate_decoding_complete_eng(self):
    """A text evaluated against a language built from itself scores 1.0."""
    sample = 'This is a sample text.'
    # The same sentence serves as both the language source and the
    # candidate decoding.
    language = textstatistics.Languauge(
        textstatistics.get_char_frequencies(sample),
        textstatistics.get_word_frequencies(sample),
    )
    score = decode.evalutate_decoding(sample, language)
    self.assertEqual(score, 1.0)
def test_evalutate_decoding_almost_eng(self):
    """Check that a slightly garbled text scores strictly between 0.5 and 1.0.

    The language is built from the clean sentence; the decoded text is the
    same sentence with the letters 'a' and 'i' swapped.
    """
    base_text = 'This is a sample text.'
    decoded_text = 'Thas as i simple text.'
    alphabet = textstatistics.get_char_frequencies(base_text)
    dictionary = textstatistics.get_word_frequencies(base_text)
    language = textstatistics.Languauge(alphabet, dictionary)
    result = decode.evalutate_decoding(decoded_text, language)
    # Check each bound separately so a failure reports the offending score
    # rather than a bare "False is not true".
    self.assertLess(result, 1.0)
    self.assertGreater(result, 0.5)
def test_evalutate_decoding_subset_eng(self):
    """The first half of the base text's words scores 1.0 against its language."""
    source = 'This is a sample text.'
    words = textstatistics.split_to_words(source)
    half = len(words) // 2
    fragment = ' '.join(words[:half])
    # The language is built from the full sentence, not from the fragment.
    language = textstatistics.Languauge(
        textstatistics.get_char_frequencies(source),
        textstatistics.get_word_frequencies(source),
    )
    score = decode.evalutate_decoding(fragment, language)
    self.assertEqual(score, 1.0)
def test_evalutate_decoding_subset_eng(self):
    """Check that the first half of the base text's words scores 1.0.

    The language is built from the full sentence, while the decoded text
    contains only the leading half of its words.
    """
    base_text = 'This is a sample text.'
    base_text_words = textstatistics.split_to_words(base_text)
    # Floor division keeps the slice bound an int; true division yields a
    # float under Python 3 and slicing with it raises TypeError.
    half = len(base_text_words) // 2
    decoded_text = ' '.join(base_text_words[:half])
    alphabet = textstatistics.get_char_frequencies(base_text)
    dictionary = textstatistics.get_word_frequencies(base_text)
    language = textstatistics.Languauge(alphabet, dictionary)
    result = decode.evalutate_decoding(decoded_text, language)
    self.assertEqual(result, 1.0)
def setUp(self):
    """Prepare the shared text, language and letter mapping for each test.

    Sets:
        self.original_text: the Ilf and Petrov quote from the data module.
        self.language: a language built from the quote's character
            frequencies (restricted to lowercase Russian letters) and its
            word frequencies.
        self.code: a dict mapping each retained letter to a letter taken
            from a seeded shuffle of the same letters.
    """
    # A set gives constant-time membership checks while filtering.
    original_alphabet = set(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
    self.original_text = data.QUOTE_FROM_ILF_AND_PETROV
    alphabet = textstatistics.get_char_frequencies(self.original_text)
    # items() exists on both Python 2 and 3 dicts; iteritems() is
    # Python 2 only and raises AttributeError on Python 3.
    alphabet = {
        char: frequency
        for (char, frequency) in alphabet.items()
        if char in original_alphabet
    }
    dictionary = textstatistics.get_word_frequencies(self.original_text)
    self.language = textstatistics.Languauge(alphabet, dictionary)
    actual_original_alphabet = alphabet.keys()
    shuffled_alphabet = list(actual_original_alphabet)
    # Fixed seed so the shuffle is reproducible between runs.
    random.seed(1001)
    random.shuffle(shuffled_alphabet)
    self.code = dict(zip(actual_original_alphabet, shuffled_alphabet))
def test_get_word_frequencies_russian(self):
    """Word frequencies of the Tolstoy quote match the expected table."""
    expected_counts = {
        u'Не': 1,
        u'слушайте': 1,
        u'тех': 1,
        u'кто': 1,
        u'говорит': 1,
        u'дурно': 1,
        u'о': 2,
        u'других': 1,
        u'и': 1,
        u'хорошо': 1,
        u'вас': 1,
    }
    actual_counts = textstatistics.get_word_frequencies(QUOTE_FROM_TOLSTOY)
    self.assertDictEqual(actual_counts, expected_counts)
def setUp(self):
    """Build the per-test fixtures: source text, language and letter mapping.

    ``self.language`` is built from the Ilf and Petrov quote, keeping only
    character frequencies for lowercase Russian letters. ``self.code`` maps
    each retained letter to a letter from a seeded shuffle of those letters.
    """
    russian_letters = list(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
    self.original_text = data.QUOTE_FROM_ILF_AND_PETROV

    char_frequencies = textstatistics.get_char_frequencies(self.original_text)
    # Drop every character that is not a lowercase Russian letter.
    alphabet = {}
    for char, frequency in char_frequencies.items():
        if char in russian_letters:
            alphabet[char] = frequency

    word_frequencies = textstatistics.get_word_frequencies(self.original_text)
    self.language = textstatistics.Languauge(alphabet, word_frequencies)

    plain_letters = alphabet.keys()
    cipher_letters = list(plain_letters)
    # Seed before shuffling so the resulting mapping is reproducible.
    random.seed(1001)
    random.shuffle(cipher_letters)
    self.code = dict(zip(plain_letters, cipher_letters))
def test_get_word_frequencies_english(self):
    """Word frequencies of the Shakespeare quote match the expected table.

    The expected table lists 'To' and 'to' as separate entries.
    """
    expected_counts = {
        'To': 1,
        'be': 2,
        'or': 1,
        'not': 1,
        'to': 1,
    }
    actual_counts = textstatistics.get_word_frequencies(QUOTE_FROM_SHAKESPEARE)
    self.assertDictEqual(actual_counts, expected_counts)
def test_get_word_frequencies_empty(self):
    """An empty string yields an empty word-frequency mapping."""
    frequencies = textstatistics.get_word_frequencies('')
    self.assertEqual(frequencies, {})