def test_with_larger_values_sc_2(self):
        """Run the smoother over a six-entry table with a very large third argument.

        Makes no assertions: it builds a ``SimpleGoodTuringSmoother``,
        initializes it and logs the smoothed value for every count in
        ``0 .. K + 4``.
        """
        # Presumably maps an occurrence count to the number of items seen
        # that many times -- confirm against SimpleGoodTuringSmoother.
        frequency_table = {
            1: 16181,
            2: 2213,
            3: 870,
            4: 431,
            5: 304,
            6: 202,
        }
        smoother = SimpleGoodTuringSmoother(K, frequency_table, 2111251811)
        smoother.initialize()

        for count in range(K + 5):
            smoothed = smoother.smooth(count)
            logger.info("c_{} : {}, \t c*_{} : {}".format(
                count, count, count, smoothed))
    def test_with_small_values(self):
        """Run the smoother over a small table whose last entry is zero.

        Makes no assertions: it builds a ``SimpleGoodTuringSmoother``,
        initializes it and logs the smoothed value for every count in
        ``0 .. K + 4``.
        """
        # Presumably maps an occurrence count to the number of items seen
        # that many times -- confirm against SimpleGoodTuringSmoother.
        frequency_table = {
            1: 10,
            2: 5,
            3: 3,
            4: 2,
            5: 1,
            6: 0,
        }
        smoother = SimpleGoodTuringSmoother(K, frequency_table, 100)
        smoother.initialize()

        for count in range(K + 5):
            smoothed = smoother.smooth(count)
            logger.info("c_{} : {}, \t c*_{} : {}".format(
                count, count, count, smoothed))
    def test_with_larger_values(self):
        """Run the smoother over a table with a dense head and two sparse outliers.

        Makes no assertions: it builds a ``SimpleGoodTuringSmoother``,
        initializes it and logs the smoothed value for every count in
        ``0 .. K + 4``.
        """
        # Keys 1-7 are contiguous; 400 and 1918 are isolated entries.
        frequency_table = {
            1: 268,
            2: 112,
            3: 70,
            4: 41,
            5: 24,
            6: 14,
            7: 15,
            400: 1,
            1918: 1,
        }
        smoother = SimpleGoodTuringSmoother(K, frequency_table, 1000)
        smoother.initialize()

        for count in range(K + 5):
            smoothed = smoother.smooth(count)
            logger.info("c_{} : {}, \t c*_{} : {}".format(
                count, count, count, smoothed))
    def test_with_zero_frequencies_in_between(self):
        """Run the smoother over a table that has zero entries between non-zero ones.

        Makes no assertions: it builds a ``SimpleGoodTuringSmoother``,
        initializes it and logs the smoothed value for every count in
        ``0 .. K + 4``.
        """
        # Entries 2 and 4 are zero while their neighbours are not.
        frequency_table = {
            1: 268,
            2: 0,
            3: 70,
            4: 0,
            5: 24,
            6: 14,
            7: 15,
            400: 1,
            1918: 1,
        }
        smoother = SimpleGoodTuringSmoother(K, frequency_table, 1000)
        smoother.initialize()

        for count in range(K + 5):
            smoothed = smoother.smooth(count)
            logger.info("c_{} : {}, \t c*_{} : {}".format(
                count, count, count, smoothed))
class SimpleGoodTuringContextlessDistributionSmoother(
        ContextlessDistributionSmoother):
    """Smooth parse result and word occurrence counts below a threshold.

    Two ``SimpleGoodTuringSmoother`` instances are built in ``initialize()``:
    one for parse result counts and one for word counts. Counts above the
    smoothing threshold are returned unchanged by the ``smooth_*`` methods.
    """
    AVG_PARSE_RESULTS_FOR_A_WORD = 6  # avg parse result count for a word
    AVG_WORDS_FOR_A_LEXEME = 50  # avg word count for a lexeme
    AVG_WORDS_FOR_A_STEM = 10  # avg word count for a stem

    def __init__(self, smoothing_threshold, unigram_collection):
        """Store the threshold and the unigram collection.

        :param smoothing_threshold: counts above this value are not smoothed;
            must be truthy and greater than 1.
        :param unigram_collection: handed as-is to ``NgramTypeFrequencyFinder``
            during ``initialize()``.
        :raises AssertionError: if the threshold is falsy or not above 1.
        """
        self._smoothing_threshold = smoothing_threshold
        self._unigram_collection = unigram_collection

        assert smoothing_threshold and smoothing_threshold > 1

    def initialize(self):
        """Collect statistics from the collection and build both smoothers.

        Must be called before either ``smooth_*`` method, since the smoothers
        are only created here.
        """
        threshold = self._smoothing_threshold
        collection = self._unigram_collection

        logger.debug(
            "Initializing SimpleGoodTuringContextlessDistributionSmoother for K:{}, AVG_PARSE_RESULTS_FOR_A_WORD:{}, AVG_WORDS_FOR_A_LEXEME:{}"
            .format(threshold, self.AVG_PARSE_RESULTS_FOR_A_WORD,
                    self.AVG_WORDS_FOR_A_LEXEME))

        # Distinct counts that are actually present in the collection.
        seen_parse_results = NgramTypeFrequencyFinder.find_distinct_parse_result_count(
            collection)
        seen_words = NgramTypeFrequencyFinder.find_distinct_word_count(
            collection)
        seen_lexemes = NgramTypeFrequencyFinder.find_distinct_count(
            collection, ['lemma_root'])
        seen_stems = NgramTypeFrequencyFinder.find_distinct_count(
            collection, ['stem'])

        # Estimate how many words could exist, then how many are still unseen.
        words_from_lexemes = seen_lexemes * self.AVG_WORDS_FOR_A_LEXEME
        words_from_stems = seen_stems * self.AVG_WORDS_FOR_A_STEM
        possible_words = words_from_stems + words_from_lexemes
        unseen_words = possible_words - seen_words

        # Same estimate on the parse result level.
        possible_parse_results = possible_words * self.AVG_PARSE_RESULTS_FOR_A_WORD
        unseen_parse_results = possible_parse_results - seen_parse_results

        summary = (
            ("Found {} distinct parse results", seen_parse_results),
            ("Found {} distinct words", seen_words),
            ("Estimated possible parse result count : {}",
             possible_parse_results),
            ("Estimated unseen parse result count : {}",
             unseen_parse_results),
            ("Found {} distinct lexemes", seen_lexemes),
            ("Estimated possible word count from lexemes: {}",
             words_from_lexemes),
            ("Estimated possible word count from stems: {}",
             words_from_stems),
            ("Estimated possible word count: {}", possible_words),
            ("Estimated unseen word count : {}", unseen_words),
        )
        for template, value in summary:
            logger.debug(template.format(value))

        # Entry 1 is seeded with the distinct count; entries 2 .. K + 1 are
        # looked up one by one.
        parse_result_table = {1: seen_parse_results}
        word_table = {1: seen_words}

        for frequency in range(2, threshold + 2):
            parse_result_table[frequency] = (
                NgramTypeFrequencyFinder.find_frequency_of_parse_result_frequency(
                    collection, frequency))
            word_table[frequency] = (
                NgramTypeFrequencyFinder.find_frequency_of_word_frequency(
                    collection, frequency))

        logger.debug("Frequencies of parse result frequencies")
        logger.debug(pformat(parse_result_table))

        logger.debug("Frequencies of word frequencies")
        logger.debug(pformat(word_table))

        self._parse_result_count_smoother = SimpleGoodTuringSmoother(
            threshold, parse_result_table, unseen_parse_results)
        self._word_count_smoother = SimpleGoodTuringSmoother(
            threshold, word_table, unseen_words)

        self._parse_result_count_smoother.initialize()
        self._word_count_smoother.initialize()

    def smooth_parse_result_occurrence_count(self,
                                             parse_result_occurrence_count):
        """Return the smoothed parse result count.

        Counts above the smoothing threshold are returned unchanged.
        """
        return (parse_result_occurrence_count
                if parse_result_occurrence_count > self._smoothing_threshold
                else self._parse_result_count_smoother.smooth(
                    parse_result_occurrence_count))

    def smooth_word_occurrence_count(self, word_occurrence_count):
        """Return the smoothed word count.

        Counts above the smoothing threshold are returned unchanged.
        """
        return (word_occurrence_count
                if word_occurrence_count > self._smoothing_threshold
                else self._word_count_smoother.smooth(word_occurrence_count))
Beispiel #6
0
    def test_with_zero_frequencies_in_between(self):
        """Log smoothed counts for a table containing zero entries in its head.

        No assertions are made; the smoother is built, initialized and then
        queried for every count from 0 up to ``K + 4``.
        """
        table = {1: 268, 2: 0, 3: 70, 4: 0, 5: 24, 6: 14, 7: 15, 400: 1, 1918: 1}
        smoother = SimpleGoodTuringSmoother(K, table, 1000)
        smoother.initialize()

        message = "c_{} : {}, \t c*_{} : {}"
        for c in range(K + 5):
            logger.info(message.format(c, c, c, smoother.smooth(c)))
Beispiel #7
0
    def test_with_larger_values_sc_2(self):
        """Log smoothed counts for a six-entry table and a very large third argument.

        No assertions are made; the smoother is built, initialized and then
        queried for every count from 0 up to ``K + 4``.
        """
        table = {1: 16181, 2: 2213, 3: 870, 4: 431, 5: 304, 6: 202}
        smoother = SimpleGoodTuringSmoother(K, table, 2111251811)
        smoother.initialize()

        message = "c_{} : {}, \t c*_{} : {}"
        for c in range(K + 5):
            logger.info(message.format(c, c, c, smoother.smooth(c)))
Beispiel #8
0
    def test_with_larger_values(self):
        """Log smoothed counts for a table with a dense head and two outliers.

        No assertions are made; the smoother is built, initialized and then
        queried for every count from 0 up to ``K + 4``.
        """
        table = {1: 268, 2: 112, 3: 70, 4: 41, 5: 24, 6: 14, 7: 15, 400: 1, 1918: 1}
        smoother = SimpleGoodTuringSmoother(K, table, 1000)
        smoother.initialize()

        message = "c_{} : {}, \t c*_{} : {}"
        for c in range(K + 5):
            logger.info(message.format(c, c, c, smoother.smooth(c)))
Beispiel #9
0
    def test_with_small_values(self):
        """Log smoothed counts for a small table whose last entry is zero.

        No assertions are made; the smoother is built, initialized and then
        queried for every count from 0 up to ``K + 4``.
        """
        table = {1: 10, 2: 5, 3: 3, 4: 2, 5: 1, 6: 0}
        smoother = SimpleGoodTuringSmoother(K, table, 100)
        smoother.initialize()

        message = "c_{} : {}, \t c*_{} : {}"
        for c in range(K + 5):
            logger.info(message.format(c, c, c, smoother.smooth(c)))
class SimpleGoodTuringContextlessDistributionSmoother(ContextlessDistributionSmoother):
    """Contextless smoother backed by two ``SimpleGoodTuringSmoother`` objects.

    ``initialize()`` gathers distinct counts and frequency-of-frequency
    tables through ``NgramTypeFrequencyFinder`` and creates one smoother for
    parse result counts and one for word counts. The ``smooth_*`` methods
    only consult those smoothers for counts at or below the threshold.
    """
    AVG_PARSE_RESULTS_FOR_A_WORD = 6    # avg parse result count for a word
    AVG_WORDS_FOR_A_LEXEME = 50         # avg word count for a lexeme
    AVG_WORDS_FOR_A_STEM = 10           # avg word count for a stem

    def __init__(self, smoothing_threshold, unigram_collection):
        """Remember the configuration; no statistics are computed yet.

        :param smoothing_threshold: upper bound of the counts that get
            smoothed; asserted to be truthy and greater than 1.
        :param unigram_collection: source passed to
            ``NgramTypeFrequencyFinder`` in ``initialize()``.
        """
        self._smoothing_threshold = smoothing_threshold
        self._unigram_collection = unigram_collection

        assert smoothing_threshold and smoothing_threshold > 1

    def initialize(self):
        """Compute the statistics and create and initialize both smoothers."""
        finder = NgramTypeFrequencyFinder
        unigrams = self._unigram_collection
        k = self._smoothing_threshold

        def debug(template, value):
            # Every statistic is reported as a single formatted debug line.
            logger.debug(template.format(value))

        logger.debug(
            "Initializing SimpleGoodTuringContextlessDistributionSmoother for K:{}, AVG_PARSE_RESULTS_FOR_A_WORD:{}, AVG_WORDS_FOR_A_LEXEME:{}".format(
                k, self.AVG_PARSE_RESULTS_FOR_A_WORD, self.AVG_WORDS_FOR_A_LEXEME))

        # Observed distinct counts.
        parse_result_total = finder.find_distinct_parse_result_count(unigrams)
        word_total = finder.find_distinct_word_count(unigrams)
        lexeme_total = finder.find_distinct_count(unigrams, ['lemma_root'])
        stem_total = finder.find_distinct_count(unigrams, ['stem'])

        # Word-level estimates: possible words come from lexemes plus stems.
        estimate_from_lexemes = lexeme_total * self.AVG_WORDS_FOR_A_LEXEME
        estimate_from_stems = stem_total * self.AVG_WORDS_FOR_A_STEM
        word_estimate = estimate_from_stems + estimate_from_lexemes
        unseen_word_count = word_estimate - word_total

        # Parse-result-level estimates derived from the word estimate.
        parse_result_estimate = word_estimate * self.AVG_PARSE_RESULTS_FOR_A_WORD
        unseen_parse_result_count = parse_result_estimate - parse_result_total

        debug("Found {} distinct parse results", parse_result_total)
        debug("Found {} distinct words", word_total)
        debug("Estimated possible parse result count : {}", parse_result_estimate)
        debug("Estimated unseen parse result count : {}", unseen_parse_result_count)

        debug("Found {} distinct lexemes", lexeme_total)
        debug("Estimated possible word count from lexemes: {}", estimate_from_lexemes)
        debug("Estimated possible word count from stems: {}", estimate_from_stems)
        debug("Estimated possible word count: {}", word_estimate)
        debug("Estimated unseen word count : {}", unseen_word_count)

        # Frequency 1 is seeded with the distinct count; frequencies
        # 2 .. K + 1 are fetched individually, parse results before words.
        parse_result_frequencies = {1: parse_result_total}
        word_frequencies = {1: word_total}

        for frequency in range(2, k + 2):
            parse_result_frequencies[frequency] = finder.find_frequency_of_parse_result_frequency(
                unigrams, frequency)
            word_frequencies[frequency] = finder.find_frequency_of_word_frequency(
                unigrams, frequency)

        logger.debug("Frequencies of parse result frequencies")
        logger.debug(pformat(parse_result_frequencies))

        logger.debug("Frequencies of word frequencies")
        logger.debug(pformat(word_frequencies))

        self._parse_result_count_smoother = SimpleGoodTuringSmoother(
            k, parse_result_frequencies, unseen_parse_result_count)
        self._word_count_smoother = SimpleGoodTuringSmoother(
            k, word_frequencies, unseen_word_count)

        self._parse_result_count_smoother.initialize()
        self._word_count_smoother.initialize()

    def smooth_parse_result_occurrence_count(self, parse_result_occurrence_count):
        """Smooth a parse result count, leaving counts above the threshold as-is."""
        count = parse_result_occurrence_count
        return count if count > self._smoothing_threshold else self._parse_result_count_smoother.smooth(count)

    def smooth_word_occurrence_count(self, word_occurrence_count):
        """Smooth a word count, leaving counts above the threshold as-is."""
        count = word_occurrence_count
        return count if count > self._smoothing_threshold else self._word_count_smoother.smooth(count)