def test_zero_integer(self):
     """Check that VoseAlias.sample_n rejects a sample size of zero.

     A VoseAlias sampler is built from the words in ``small.txt`` and
     ``sample_n(0)`` must raise SystemExit with a message that matches
     ``nonnegative_integer_error`` followed by ``"0"``.
     """
     words = vose_sampler.get_words(valid_folder + "small.txt")
     word_dist = vose_sampler.sample2dist(words)
     VA_words = vose_sampler.VoseAlias(word_dist)
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; prefer the current name and fall back to the old one
     # only where the new one does not exist (Python 2.7).
     assert_raises_regex = getattr(self, "assertRaisesRegex", None)
     if assert_raises_regex is None:
         assert_raises_regex = self.assertRaisesRegexp
     # The expected message is passed as a regular-expression pattern.
     assert_raises_regex(
         SystemExit, nonnegative_integer_error + "0", VA_words.sample_n, 0)
 def test_negative_integer(self):
     """Check that VoseAlias.sample_n rejects a negative sample size.

     A VoseAlias sampler is built from the words in ``small.txt`` and
     ``sample_n(-1)`` must raise SystemExit with a message that matches
     ``nonnegative_integer_error`` followed by ``"-1"``.
     """
     words = vose_sampler.get_words(valid_folder + "small.txt")
     word_dist = vose_sampler.sample2dist(words)
     VA_words = vose_sampler.VoseAlias(word_dist)
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; prefer the current name and fall back to the old one
     # only where the new one does not exist (Python 2.7).
     assert_raises_regex = getattr(self, "assertRaisesRegex", None)
     if assert_raises_regex is None:
         assert_raises_regex = self.assertRaisesRegexp
     # The expected message is passed as a regular-expression pattern.
     assert_raises_regex(
         SystemExit, nonnegative_integer_error + "-1", VA_words.sample_n, -1)
 def test_zero_integer(self):
     """Check that VoseAlias.sample_n rejects a sample size of zero.

     A VoseAlias sampler is built from the words in ``small.txt`` and
     ``sample_n(0)`` must raise SystemExit with a message that matches
     ``nonnegative_integer_error`` followed by ``"0"``.
     """
     words = vose_sampler.get_words(valid_folder + "small.txt")
     word_dist = vose_sampler.sample2dist(words)
     VA_words = vose_sampler.VoseAlias(word_dist)
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; prefer the current name and fall back to the old one
     # only where the new one does not exist (Python 2.7).
     assert_raises_regex = getattr(self, "assertRaisesRegex", None)
     if assert_raises_regex is None:
         assert_raises_regex = self.assertRaisesRegexp
     # The expected message is passed as a regular-expression pattern.
     assert_raises_regex(
         SystemExit, nonnegative_integer_error + "0", VA_words.sample_n, 0)
 def test_negative_integer(self):
     """Check that VoseAlias.sample_n rejects a negative sample size.

     A VoseAlias sampler is built from the words in ``small.txt`` and
     ``sample_n(-1)`` must raise SystemExit with a message that matches
     ``nonnegative_integer_error`` followed by ``"-1"``.
     """
     words = vose_sampler.get_words(valid_folder + "small.txt")
     word_dist = vose_sampler.sample2dist(words)
     VA_words = vose_sampler.VoseAlias(word_dist)
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; prefer the current name and fall back to the old one
     # only where the new one does not exist (Python 2.7).
     assert_raises_regex = getattr(self, "assertRaisesRegex", None)
     if assert_raises_regex is None:
         assert_raises_regex = self.assertRaisesRegexp
     # The expected message is passed as a regular-expression pattern.
     assert_raises_regex(
         SystemExit, nonnegative_integer_error + "-1", VA_words.sample_n, -1)
    def test_output_alias_generation(self):
        """Check that VoseAlias.alias_generation follows the corpus distribution.

        One word is chosen at random from the sampler's distribution, 1000
        draws are made with ``alias_generation`` and the number of times the
        chosen word comes up is counted. A 2-sided binomial hypothesis test
        is then carried out at the 1% significance level:
        H_0: p_original == p_observed (the word is drawn with the same
             probability it has in the distribution)
        H_1: p_original != p_observed
        The test fails when the p-value is not greater than alpha.
        """
        # Same text as a single literal; every continuation line keeps its
        # eight leading spaces so the printed output is unchanged.
        warning = (
            "WARNING: There is a random element to test_output_alias_generation\n"
            "        so it is likely to occasionally fail, nonetheless if the alias_generation\n"
            "        method is working correctly failures will be very rare (testing at alpha=0.01\n"
            "        implies we should expect a Type I error about 1% of the time)."
        )
        print(warning)

        # Build the alias sampler from the small corpus.
        sampler = vose_sampler.VoseAlias(
            vose_sampler.sample2dist(
                vose_sampler.get_words(valid_folder + "small.txt")))

        # Pick the word to track before any sampling takes place.
        target = random.choice(list(sampler.dist))

        # Count how often the tracked word is generated.
        trials = 1000
        hits = sum(
            1 for _ in range(trials) if sampler.alias_generation() == target)

        # Two tail probabilities of the observed count under H_0.
        expected_p = sampler.dist[target]
        upper_tail = math.fsum(
            self.dbinom(k, trials, expected_p)
            for k in range(hits, trials + 1))
        lower_tail = math.fsum(
            self.dbinom(k, trials, expected_p) for k in range(hits + 1))
        p_value = 2 * min(upper_tail, lower_tail)

        # H_0 is rejected (and the test fails) when p_value <= significance.
        significance = 0.01
        self.assertGreater(p_value, significance)
    def test_output_alias_generation(self):
        """Check that VoseAlias.alias_generation follows the corpus distribution.

        One word is chosen at random from the sampler's distribution, 1000
        draws are made with ``alias_generation`` and the number of times the
        chosen word comes up is counted. A 2-sided binomial hypothesis test
        is then carried out at the 1% significance level:
        H_0: p_original == p_observed (the word is drawn with the same
             probability it has in the distribution)
        H_1: p_original != p_observed
        The test fails when the p-value is not greater than alpha.
        """
        # Same text as a single literal; every continuation line keeps its
        # eight leading spaces so the printed output is unchanged.
        warning = (
            "WARNING: There is a random element to test_output_alias_generation\n"
            "        so it is likely to occasionally fail, nonetheless if the alias_generation\n"
            "        method is working correctly failures will be very rare (testing at alpha=0.01\n"
            "        implies we should expect a Type I error about 1% of the time)."
        )
        print(warning)

        # Build the alias sampler from the small corpus.
        sampler = vose_sampler.VoseAlias(
            vose_sampler.sample2dist(
                vose_sampler.get_words(valid_folder + "small.txt")))

        # Pick the word to track before any sampling takes place.
        target = random.choice(list(sampler.dist))

        # Count how often the tracked word is generated.
        trials = 1000
        hits = sum(
            1 for _ in range(trials) if sampler.alias_generation() == target)

        # Two tail probabilities of the observed count under H_0.
        expected_p = sampler.dist[target]
        upper_tail = math.fsum(
            self.dbinom(k, trials, expected_p)
            for k in range(hits, trials + 1))
        lower_tail = math.fsum(
            self.dbinom(k, trials, expected_p) for k in range(hits + 1))
        p_value = 2 * min(upper_tail, lower_tail)

        # H_0 is rejected (and the test fails) when p_value <= significance.
        significance = 0.01
        self.assertGreater(p_value, significance)
 def test_output_get_word(self):
     """Check that vose_sampler.get_words turns a corpus into a word list.

     Reading ``single_word.txt`` must give exactly ``["Speechmatics"]``.
     """
     corpus_path = valid_folder + "single_word.txt"
     self.assertEqual(vose_sampler.get_words(corpus_path), ["Speechmatics"])
 def test_output_get_word(self):
     """Check that vose_sampler.get_words turns a corpus into a word list.

     Reading ``single_word.txt`` must give exactly ``["Speechmatics"]``.
     """
     corpus_path = valid_folder + "single_word.txt"
     self.assertEqual(vose_sampler.get_words(corpus_path), ["Speechmatics"])