def test_zero_integer(self):
    """Test vose_sampler.VoseAlias.sample_n against a sample size of zero.

    Builds a VoseAlias from the "small.txt" corpus and expects sample_n(0)
    to raise SystemExit with a message matching nonnegative_integer_error
    followed by "0". The expected message is used as a regular expression
    and searched for in the exception text.
    """
    words = vose_sampler.get_words(valid_folder + "small.txt")
    word_dist = vose_sampler.sample2dist(words)
    VA_words = vose_sampler.VoseAlias(word_dist)

    # assertRaisesRegexp is a deprecated alias that was removed in
    # Python 3.12; prefer assertRaisesRegex and only fall back to the old
    # spelling on interpreters that lack the new one (Python 2.7).
    assert_raises_regex = (
        getattr(self, "assertRaisesRegex", None) or self.assertRaisesRegexp
    )
    assert_raises_regex(
        SystemExit, nonnegative_integer_error + "0", VA_words.sample_n, 0
    )
def test_negative_integer(self):
    """Test vose_sampler.VoseAlias.sample_n against a negative sample size.

    Builds a VoseAlias from the "small.txt" corpus and expects sample_n(-1)
    to raise SystemExit with a message matching nonnegative_integer_error
    followed by "-1". The expected message is used as a regular expression
    and searched for in the exception text.
    """
    words = vose_sampler.get_words(valid_folder + "small.txt")
    word_dist = vose_sampler.sample2dist(words)
    VA_words = vose_sampler.VoseAlias(word_dist)

    # assertRaisesRegexp is a deprecated alias that was removed in
    # Python 3.12; prefer assertRaisesRegex and only fall back to the old
    # spelling on interpreters that lack the new one (Python 2.7).
    assert_raises_regex = (
        getattr(self, "assertRaisesRegex", None) or self.assertRaisesRegexp
    )
    assert_raises_regex(
        SystemExit, nonnegative_integer_error + "-1", VA_words.sample_n, -1
    )
def test_output_alias_generation(self):
    """Test vose_sampler.VoseAlias.alias_generation to ensure it generates
    words with the same distribution as the original corpus.

    A word is chosen at random from the distribution, n samples are drawn
    with alias_generation, and the number of draws equal to that word is
    compared with the word's probability in the original corpus. This
    performs a 2-sided hypothesis test at the 1% significance level, that:

    H_0: observed proportion of a randomly selected word is equal to the
         proportion seen in the original corpus
         (i.e. p_original == p_observed)
    H_1: p_original != p_observed

    NOTE: this method was previously defined twice with identical logic
    (the later definition silently replaced the earlier one); it is now
    defined once.
    """
    print(
        "WARNING: There is a random element to test_output_alias_generation\n"
        "so it is likely to occasionally fail, nonetheless if the alias_generation\n"
        "method is working correctly failures will be very rare (testing at alpha=0.01\n"
        "implies we should expect a Type I error about 1% of the time)."
    )

    # Construct the sampler from the corpus distribution
    words = vose_sampler.get_words(valid_folder + "small.txt")
    word_dist = vose_sampler.sample2dist(words)
    VA_words = vose_sampler.VoseAlias(word_dist)

    # Generate a sample and count the observations of a randomly selected word
    word = random.choice(list(VA_words.dist))
    n = 1000
    t = sum(1 for _ in range(n) if VA_words.alias_generation() == word)

    # Compute the p-value from both tails. self.dbinom is defined elsewhere
    # in this class; presumably the binomial probability mass function
    # dbinom(x, n, p) -- confirm against its definition.
    p_original = VA_words.dist[word]
    upper_tail = math.fsum(
        self.dbinom(x, n, p_original) for x in range(t, n + 1)
    )
    lower_tail = math.fsum(
        self.dbinom(x, n, p_original) for x in range(t + 1)
    )
    p = 2 * min(upper_tail, lower_tail)

    # Do not accept H_0 if p <= alpha
    alpha = 0.01
    self.assertGreater(p, alpha)
def test_output_get_word(self):
    """Check that vose_sampler.get_words turns a corpus file into a list of
    its words.

    The "single_word.txt" fixture is expected to yield exactly
    ["Speechmatics"].
    """
    corpus_path = valid_folder + "single_word.txt"
    self.assertEqual(vose_sampler.get_words(corpus_path), ["Speechmatics"])