def test_calculate_frequencies_ideal(self):
    """
    Tokens that are all distinct must each get a frequency of one
    """
    tokens = ['weather', 'sunny', 'man', 'happy']
    expected = {'weather': 1, 'sunny': 1, 'man': 1, 'happy': 1}
    self.assertEqual(expected, calculate_frequencies(tokens))
def test_calculate_frequencies_complex(self):
    """
    Tokens that occur more than once must be counted per occurrence
    """
    # 'weather' and 'man' appear twice, the other tokens once.
    tokens = ['weather', 'sunny', 'man', 'happy', 'weather', 'man']
    result = calculate_frequencies(tokens)
    self.assertEqual(
        {'weather': 2, 'sunny': 1, 'man': 2, 'happy': 1},
        result)
def test_calculate_frequencies_bad_input(self):
    """
    Calculate frequencies invalid input tokens check

    Each value in ``bad_inputs`` is passed to ``calculate_frequencies``
    and the result is expected to be an empty dictionary.
    """
    bad_inputs = ['string', {}, (), None, 9, 9.34, True, [None]]
    expected = {}
    for bad_input in bad_inputs:
        # subTest labels a failure with the offending input and lets the
        # remaining inputs be checked instead of stopping at the first one.
        with self.subTest(bad_input=bad_input):
            actual = calculate_frequencies(bad_input)
            self.assertEqual(expected, actual)
def test_calculate_frequencies_return_value(self):
    """
    Calculate frequencies return values check

    Checks that the result has one entry per input token, that every
    token maps to a truthy value, and that the value stored for the
    first token is an ``int``.
    """
    tokens = ['token1', 'token2']
    expected = 2
    actual = calculate_frequencies(tokens)
    self.assertEqual(expected, len(actual))
    for token in tokens:
        # A missing token is reported as a test failure rather than
        # surfacing as a KeyError raised by the lookup below.
        self.assertIn(token, actual)
        self.assertTrue(actual[token])
    # assertIsInstance names the offending value and type on failure,
    # unlike assertTrue(isinstance(...)).
    self.assertIsInstance(actual[tokens[0]], int)
# Input files are resolved relative to this script's own directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(current_dir, 'data.txt')
stop_words_path = os.path.join(current_dir, 'stop_words.txt')

data = main.read_from_file(data_path)
# The stop-word file content is split on newlines into a list of entries.
stop_words = main.read_from_file(stop_words_path).split('\n')

# Printed after every step to visually separate the sections of output.
separator = '\n-----------------------------\n'

# Step 1: tokenize the raw text and show the first ten tokens.
tokens = main.tokenize(data)
print('tokens:', tokens[:10])
print(separator)

# Step 2: drop the stop words.
# Author's timing note: old: 34 sec, new - 3.4 sec
tokens = main.remove_stop_words(tokens, stop_words)
print('tokens without stop words:', tokens[:10])
print(separator)

# Step 3: count the tokens and look up the first one.
# Author's timing note: old: 116 sec, new: ~81 sec
frequencies = main.calculate_frequencies(tokens)
first_word_frequency = frequencies[tokens[0]]
print('frequency for the first word:', first_word_frequency)
print(separator)

# Step 4: the ten most frequent words.
top_10 = main.get_top_n_words(frequencies, 10)
print('top 10 words:', top_10)
print(separator)

# Step 5: concordance for "cat" with the arguments 2 and 3; only the first
# five entries are printed.
concordance_cat = main.get_concordance(tokens, 'cat', 2, 3)
print('concordance for "cat", left = 2, right = 3:', concordance_cat[:5])
print(separator)

# Step 6: adjacent words for "cat" with the same arguments.
adjacent_words_cat = main.get_adjacent_words(tokens, 'cat', 2, 3)
print('adjacent words for "cat" left = 2, right = 3:',
      adjacent_words_cat[:5])
print(separator)
import os import main if __name__ == '__main__': # use data.txt file to test your program current_dir = os.path.dirname(os.path.abspath(__file__)) data = main.read_from_file(os.path.join(current_dir, 'data.txt')) stop_words = main.read_from_file( os.path.join(current_dir, 'stop_words.txt')).split('\n') # here goes your logic: calling methods from concordance.py tokenized_data = main.tokenize(data) clean_data = main.remove_stop_words(tokenized_data, stop_words) top_n = main.get_top_n_words(main.calculate_frequencies(clean_data), 13) key_word = top_n[-1] print(f'13th popular word: {key_word}. Let`s use if for further functions') closest_words = main.get_adjacent_words(clean_data, key_word, 3, 2) if len(closest_words) > 0: print( f"\nThird words from the left and second words from the right for " f"the word '{key_word}' (first 5 cases) are") for adjacent_words in closest_words[:5]: print('\t', adjacent_words) concordances = main.get_concordance(clean_data, key_word, 2, 2) if len(concordances) > 0: print( f"\nThe first three concordances (with 2 word on the left and 2 on the right)"
import os

import main

if __name__ == '__main__':
    # Input files are resolved relative to this script's own directory.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data = main.read_from_file(os.path.join(current_dir, 'data.txt'))
    # Hand remove_stop_words a list of entries instead of the raw file text.
    # NOTE(review): assumes read_from_file returns the file content as a
    # single string (data is sliced and printed as raw text below) - confirm.
    stop_words = main.read_from_file(
        os.path.join(current_dir, 'stop_words.txt')).split('\n')

    tokens = main.tokenize(data)
    print(f'Raw text: {data[:5]}')
    print(f'Tokenized text: {tokens[:5]}')

    tokens = main.remove_stop_words(tokens, stop_words)
    print(f'Text without stop-words: {tokens[:5]}')

    # Frequencies are calculated for the first 5000 tokens only.
    frequencies = main.calculate_frequencies(tokens[:5000])
    print(f'Frequencies: {frequencies[tokens[0]]}')

    # The same target word is used for every lookup below.
    word = 'dog'
    concordance = main.get_concordance(tokens, word, 2, 0)
    print(f'The concordance for {word}: {concordance[:5]}')

    adjacent = main.get_adjacent_words(tokens, word, 2, 0)
    print(f'Adjacent words: {adjacent[:5]}')

    sorted_concordance = main.sort_concordance(tokens, word, 2, 0, True)
    print(f'Sorted concordance: {sorted_concordance[:5]}')

    main.write_to_file('', sorted_concordance)

    # The final value is kept in a module-level name.
    RESULT = sorted_concordance