def compare_unigram_probabilities_intersection(self) -> None:
    """Compare Gaga vs. LeBron unigram probabilities over shared vocabulary only."""
    # NOTE(review): no test_ prefix — unittest discovery skips this method; confirm intentional.
    gaga_probs = unigram_probabilities(
        gen_sentences(os.path.join("test_data", "ladygaga.txt")), lower=True)
    lebron_probs = unigram_probabilities(
        gen_sentences(os.path.join("test_data", "KingJames.txt")), lower=True)
    diff = unigram_probabilities_difference(
        gaga_probs, lebron_probs, intersection_only=True)
    self.assertAlmostEqual(0.01179526361706696, diff["#"])
    self.assertAlmostEqual(0.011647311103199175, diff["the"])
    self.assertAlmostEqual(0.005001338537094464, diff["love"])
def compare_unigram_probabilities(self) -> None:
    """Compare Gaga vs. LeBron unigram probabilities over the full vocabulary."""
    # NOTE(review): no test_ prefix — unittest discovery skips this method; confirm intentional.
    gaga_probs = unigram_probabilities(
        gen_sentences(os.path.join("test_data", "ladygaga.txt")), lower=True)
    lebron_probs = unigram_probabilities(
        gen_sentences(os.path.join("test_data", "KingJames.txt")), lower=True)
    diff = unigram_probabilities_difference(
        gaga_probs, lebron_probs, intersection_only=False)
    self.assertAlmostEqual(0.01179526361706696, diff["#"])
    self.assertAlmostEqual(0.011647311103199175, diff["the"])
    self.assertAlmostEqual(0.008741258741258742, diff["chromatica"])
def compare_bigram_probabilities(self) -> None:
    """Compare probabilities of words following "i" for Gaga vs. Lizzo (full vocab)."""
    # NOTE(review): no test_ prefix — unittest discovery skips this method; confirm intentional.
    gaga_probs = bigram_probabilities(
        gen_sentences(os.path.join("test_data", "ladygaga.txt")), lower=True)
    lizzo_probs = bigram_probabilities(
        gen_sentences(os.path.join("test_data", "lizzo.txt")), lower=True)
    bigram_diff = bigram_probabilities_difference(
        gaga_probs, lizzo_probs, "i", intersection_only=False)
    self.assertAlmostEqual(0.08695652173913043, bigram_diff["want"])
    self.assertAlmostEqual(0.043478260869565216, bigram_diff["wish"])
def compare_bigram_probabilities_intersection(self) -> None:
    """Compare probabilities of words following "i" for Lizzo vs. Gaga (shared words only)."""
    # NOTE(review): no test_ prefix — unittest discovery skips this method; confirm intentional.
    lizzo_probs = bigram_probabilities(
        gen_sentences(os.path.join("test_data", "lizzo.txt")), lower=True)
    gaga_probs = bigram_probabilities(
        gen_sentences(os.path.join("test_data", "ladygaga.txt")), lower=True)
    bigram_diff = bigram_probabilities_difference(
        lizzo_probs, gaga_probs, "i", intersection_only=True)
    self.assertAlmostEqual(0.014624505928853754, bigram_diff["hope"])
    self.assertAlmostEqual(0.011462450592885365, bigram_diff["love"])
def explore_mini_corpus():
    """Explore and compare the documents in the provided mini-corpus."""

    def load(filename):
        # Materialize the sentence generator so a corpus can be reused.
        return list(gen_sentences(os.path.join("test_data", filename)))

    # Brown samples by genre
    news_sents = load("brown-news.txt")
    humor_sents = load("brown-humor.txt")
    sci_fi_sents = load("brown-science_fiction.txt")
    romance_sents = load("brown-romance.txt")
    # Tweets
    ariana_sents = load("ArianaGrande.txt")
    cristiano_sents = load("Cristiano.txt")
    kingjames_sents = load("KingJames.txt")
    gaga_sents = load("ladygaga.txt")
    lizzo_sents = load("lizzo.txt")

    # Examples:
    print("Lady Gaga")
    for word, prob in Counter(
            unigram_probabilities(gaga_sents)).most_common(10):
        print(f"{word}: {prob:.3f}")
    print()
    print("\nLebron James")
    for word, prob in Counter(
            unigram_probabilities(kingjames_sents)).most_common(10):
        print(f"{word}: {prob:.3f}")
    print()
    print("Difference between Lebron and Gaga unigram probabilities")
    diff = unigram_probabilities_difference(
        unigram_probabilities(gaga_sents, lower=True),
        unigram_probabilities(kingjames_sents, lower=True),
        intersection_only=False,
    )
    for word, val in Counter(diff).most_common(20):
        print(f"{word}: {val}")
    print()
    print(
        'Difference between probabilities of word following "I" for Gaga and Lizzo'
    )
    bigram_diff = bigram_probabilities_difference(
        bigram_probabilities(lizzo_sents, lower=True),
        bigram_probabilities(gaga_sents, lower=True),
        "i",
        intersection_only=True,
    )
    for word, val in Counter(bigram_diff).most_common(20):
        print(f"{word}: {val}")
def test_all(self) -> None:
    """Test all of gen_sentences."""
    path_1 = os.path.join("test_data", "hw1_tokenized_text_1.txt")
    path_2 = os.path.join("test_data", "hw1_tokenized_text_2.txt")

    # Test type: gen_sentences must be a generator function.
    self.assertEqual(GeneratorType, type(gen_sentences(path_1)))

    # Test basic: two sentences, then exhaustion.
    gen = gen_sentences(path_1)
    self.assertEqual(
        ["Tokenized", "text", "is", "easy", "to", "work", "with", "."],
        next(gen))
    self.assertEqual(
        ["Writing", "a", "tokenizer", "is", "a", "pain", "."], next(gen))
    with self.assertRaises(StopIteration):
        next(gen)

    # Test advanced: punctuation-heavy input and blank-ish lines.
    gen = gen_sentences(path_2)
    self.assertEqual(["Hello", ",", "world", "!"], next(gen))
    # Between these sentences, there is a line in the file with a single space,
    # which should be skipped over.
    self.assertEqual(["This", "is", "a", "normal", "sentence", "."],
                     next(gen))
    self.assertEqual(
        [
            '"', "I", "don't", "like", "it", "when", "there's", "too",
            "much", "punctuation", "!", '"', ",", "they", "exclaimed", ".",
        ],
        next(gen),
    )
    with self.assertRaises(StopIteration):
        next(gen)
def test_count_unigrams_type(self) -> None:
    """Test count unigrams type"""
    counts = count_unigrams(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_1.txt")))
    # Every key of the unigram counts should be a plain string token.
    for key in counts:
        self.assertEqual(str, type(key))
def test_type_unigram(self) -> None:
    """Test that a generator is returned."""
    unigram_gen = gen_unigrams(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_1.txt")))
    self.assertEqual(GeneratorType, type(unigram_gen))
    # Items yielded by the generator are string tokens.
    self.assertEqual(str, type(next(unigram_gen)))
def test_trigram_probabilities_type(self) -> None:
    """Test that trigram_probabilities returns a defaultdict."""
    probs = trigram_probabilities(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")),
        lower=True,
    )
    self.assertEqual(defaultdict, type(probs))
def test_bigram_frequency_dist(self) -> None:
    """Test the bigram conditional frequency distribution (case preserved)."""
    dist = bigram_freq_dist(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")))
    # (first word, second word, expected count)
    cases = [
        ("The", "cat", 2),
        ("The", "dog", 3),
        ("the", "cat", 1),
        (".", END_TOKEN0, 7),
    ]
    for first, second, count in cases:
        self.assertEqual(count, dist[first][second])
def test_trigram_frequency_dist(self) -> None:
    """Test the trigram conditional frequency distribution (case preserved)."""
    dist = trigram_freq_dist(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")))
    # (context bigram, continuation word, expected count)
    cases = [
        (("The", "dog"), "drank", 2),
        (("squirrel", "ate"), "peanuts", 1),
        ((".", END_TOKEN0), END_TOKEN1, 7),
        ((START_TOKEN1, START_TOKEN0), "The", 7),
    ]
    for context, word, count in cases:
        self.assertEqual(count, dist[context][word])
    # All continuations of ("The", "dog") together occur three times.
    self.assertEqual(3, sum(dist[("The", "dog")].values()))
def test_count_trigrams_type(self) -> None:
    """Test trigrams are tuples with 3 strings"""
    counts = count_trigrams(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_1.txt")))
    # Each key must be a 3-tuple.
    for key in counts:
        self.assertEqual(tuple, type(key))
        self.assertEqual(3, len(key))
def test_count_trigrams_lower(self) -> None:
    """Test count trigrams with lowercasing."""
    trigrams = count_trigrams(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")),
        lower=True,
    )
    # With lowercasing, the original-case key is absent and the
    # lowercased key carries all seven sentence starts.
    self.assertEqual(0, trigrams[(START_TOKEN1, START_TOKEN0, "The")])
    self.assertEqual(7, trigrams[(START_TOKEN1, START_TOKEN0, "the")])
def test_count_trigrams(self) -> None:
    """Test count trigrams with casing."""
    trigrams = count_trigrams(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")))
    # (trigram, expected count)
    cases = [
        (("The", "dog", "drank"), 2),
        (("squirrel", "ate", "peanuts"), 1),
        ((".", END_TOKEN0, END_TOKEN1), 7),
        ((START_TOKEN1, START_TOKEN0, "The"), 7),
    ]
    for trigram, count in cases:
        self.assertEqual(count, trigrams[trigram])
def test_unigram_probabilities(self) -> None:
    """Test unigram probability estimates with lowercasing."""
    probs = unigram_probabilities(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")),
        lower=True,
    )
    for token, expected in [
        ("the", 0.26315789),
        ("dog", 0.07894736),
        ("pizza", 0.02631578),
    ]:
        self.assertAlmostEqual(expected, probs[token])
    # An unseen token has probability zero.
    self.assertEqual(0, probs["cookies"])
def test_trigram_frequency_dist_lower(self) -> None:
    """Test the trigram frequency distribution with lowercasing."""
    dist = trigram_freq_dist(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")),
        lower=True,
    )
    # Mixed-case context must be empty once everything is lowercased.
    self.assertEqual(0, dist[("The", "cat")]["ate"])
    self.assertEqual(2, dist[("the", "dog")]["drank"])
    self.assertEqual(1, dist[("the", "cat")]["ate"])
    self.assertEqual(3, sum(dist[("the", "cat")].values()))
def test_count_unigrams_lower(self) -> None:
    """Test count unigrams with lowercase option=True"""
    unigrams = count_unigrams(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")),
        lower=True,
    )
    # (token, expected count) — "the" absorbs the "The" occurrences.
    for token, count in [("the", 10), ("cat", 3), ("dog", 3), (".", 7),
                         ("pizza", 1)]:
        self.assertEqual(count, unigrams[token])
def test_count_bigrams(self) -> None:
    """Test count bigrams with case"""
    bigrams = count_bigrams(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")))
    # (bigram, expected count)
    cases = [
        (("The", "dog"), 3),
        (("squirrel", "ate"), 2),
        (("The", "cat"), 2),
        (("the", "cat"), 1),
        (("drank", "coffee"), 1),
        ((".", END_TOKEN0), 7),
    ]
    for bigram, count in cases:
        self.assertEqual(count, bigrams[bigram])
def test_trigram_probabilities(self) -> None:
    """Test trigram conditional probabilities with lowercasing."""
    probs = trigram_probabilities(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")),
        lower=True,
    )
    self.assertAlmostEqual(0.5, probs[("dog", "drank")]["coffee"])
    self.assertAlmostEqual(0.4285714285,
                           probs[(START_TOKEN0, "the")]["dog"])
    self.assertAlmostEqual(0.5, probs[("squirrel", "ate")]["peanuts"])
    # An unseen context/word pair has probability zero.
    self.assertEqual(0, probs[("cookies", "are")]["good"])
def test_bigram_probabilities(self) -> None:
    """Test bigram conditional probabilities with lowercasing."""
    probs = bigram_probabilities(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")),
        lower=True,
    )
    self.assertAlmostEqual(0.3, probs["the"]["dog"])
    # In this fixture the start token is always followed by "the".
    self.assertEqual(1, probs[START_TOKEN0]["the"])
    self.assertEqual(1, probs["squirrel"]["ate"])
    # An unseen context/word pair has probability zero.
    self.assertEqual(0, probs["cookies"]["are"])
def test_count_unigrams(self) -> None:
    """Test count unigrams with case"""
    unigrams = count_unigrams(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_3.txt")))
    # (token, expected count) — "The" and "the" stay distinct keys.
    cases = [
        ("The", 7),
        ("dog", 3),
        ("cat", 3),
        ("the", 3),
        (".", 7),
        ("pizza", 1),
    ]
    for token, count in cases:
        self.assertEqual(count, unigrams[token])
def test_type_trigrams(self) -> None:
    """Test that gen_trigrams returns a generator yielding tuples."""
    trigram_gen = gen_trigrams(
        gen_sentences(os.path.join("test_data", "hw1_tokenized_text_1.txt")))
    self.assertEqual(GeneratorType, type(trigram_gen))
    self.assertEqual(tuple, type(next(trigram_gen)))
def debug_functions():
    """Print out small output for each function in hw1.

    You can modify this to debug your functions.
    """
    # Generate sentences by loading from test data.
    # Store in a list since it will be iterated multiple times below.
    news_sents = list(
        gen_sentences(os.path.join("test_data", "brown-news.txt")))
    # Generate unigrams, bigrams, trigrams --------------------------------------------------------
    print("----------N-gram Generators----------")
    unigrams = list(gen_unigrams(news_sents))
    bigrams = list(gen_bigrams(news_sents))
    trigrams = list(gen_trigrams(news_sents))
    print(f"Unigrams:\n {unigrams[:8]}\n")
    print(f"Bigrams:\n {bigrams[:8]}\n")
    print(f"Trigrams:\n {trigrams[:8]}\n")
    print()
    # Counts --------------------------------------------------------------------------------------
    print("----------Counts----------")
    print("--Unigram Counts--")
    unigram_counts = count_unigrams(news_sents)
    # Show the first three keys of each count mapping only.
    for gram in list(unigram_counts)[:3]:
        print(f"{gram}: {unigram_counts[gram]}")
    print()
    print("--Bigram Counts--")
    bigram_counts = count_bigrams(news_sents)
    for gram in list(bigram_counts)[:3]:
        print(f"{gram}: {bigram_counts[gram]}")
    print()
    print("--Trigram Counts--")
    trigram_counts = count_trigrams(news_sents)
    for gram in list(trigram_counts)[:3]:
        print(f"{gram}: {trigram_counts[gram]}")
    print()
    # Frequency Distributions ---------------------------------------------------------------------
    print("----------Frequency Distributions ----------")
    print("Bigram Frequency Distribution")
    bigram_freq = bigram_freq_dist(news_sents)
    # For two example contexts, show the three most frequent continuations.
    for word1 in list(bigram_freq)[:2]:
        print(f"\t{word1}: ")
        for word, count in Counter(bigram_freq[word1]).most_common(3):
            print(f"\t\t{word}: {count}")
    print()
    print("Trigram Frequency Distribution")
    trigram_freq = trigram_freq_dist(news_sents)
    for bigram in list(trigram_freq)[:2]:
        print(f"\t{bigram}: ")
        for word, count in Counter(trigram_freq[bigram]).most_common(3):
            print(f"\t\t{word}: {count}")
    print()
    # Probabilities -------------------------------------------------------------------------------
    print("----------Probabilities----------")
    print("Unigram probabilities")
    unigram_probs = unigram_probabilities(news_sents)
    for word in list(unigram_probs)[:3]:
        print(f"\t{word}: {unigram_probs[word]:.3f}")
    print()
    print("Bigram probabilities")
    bigram_probs = bigram_probabilities(news_sents)
    # For three example contexts, show the three most probable continuations.
    for word1 in list(bigram_probs)[:3]:
        print(f"\t{word1}:")
        for word2, prob in Counter(bigram_probs[word1]).most_common(3):
            print(f"\t\t{word2}: {prob:.5f}")
    print()
    print("Trigram probabilities")
    trigram_probs = trigram_probabilities(news_sents)
    for bigram in list(trigram_probs)[:3]:
        print(f"\t{bigram}:")
        for word, prob in Counter(trigram_probs[bigram]).most_common(3):
            print(f"\t\t{word}: {prob:.3f}")
    print()