def initialize(self):
    """Collect unigram statistics and build the two count smoothers.

    Queries ``NgramTypeFrequencyFinder`` against ``self._unigram_collection``
    for the number of distinct parse results, words, lexemes (keyed by
    ``'lemma_root'``) and stems (keyed by ``'stem'``).  From these it derives:

    * an estimate of the possible word count, as
      ``stems * AVG_WORDS_FOR_A_STEM + lexemes * AVG_WORDS_FOR_A_LEXEME``;
    * an estimate of the possible parse result count, as the word estimate
      times ``AVG_PARSE_RESULTS_FOR_A_WORD``;
    * the "unseen" counts, as each estimate minus the matching distinct count.

    It then builds frequency-of-frequency tables for frequencies
    ``1 .. self._smoothing_threshold + 1`` and hands them, together with the
    unseen counts, to two ``SimpleGoodTuringSmoother`` instances stored on
    ``self._parse_result_count_smoother`` and ``self._word_count_smoother``.
    Both smoothers are initialized before returning.
    """
    logger.debug(
        "Initializing SimpleGoodTuringContextlessDistributionSmoother for K:{}, AVG_PARSE_RESULTS_FOR_A_WORD:{}, AVG_WORDS_FOR_A_LEXEME:{}".format(
            self._smoothing_threshold,
            self.AVG_PARSE_RESULTS_FOR_A_WORD,
            self.AVG_WORDS_FOR_A_LEXEME,
        )
    )

    finder = NgramTypeFrequencyFinder
    unigrams = self._unigram_collection

    # Distinct counts of what has actually been observed in the collection.
    seen_parse_results = finder.find_distinct_parse_result_count(unigrams)
    seen_words = finder.find_distinct_word_count(unigrams)
    seen_lexemes = finder.find_distinct_count(unigrams, ['lemma_root'])
    seen_stems = finder.find_distinct_count(unigrams, ['stem'])

    # Estimates of how many words / parse results could exist at all.
    words_from_lexemes = seen_lexemes * self.AVG_WORDS_FOR_A_LEXEME
    words_from_stems = seen_stems * self.AVG_WORDS_FOR_A_STEM
    possible_words = words_from_stems + words_from_lexemes
    unseen_words = possible_words - seen_words

    possible_parse_results = possible_words * self.AVG_PARSE_RESULTS_FOR_A_WORD
    unseen_parse_results = possible_parse_results - seen_parse_results

    # Emit the diagnostics in a fixed order, one debug record per figure.
    diagnostics = (
        ("Found {} distinct parse results", seen_parse_results),
        ("Found {} distinct words", seen_words),
        ("Estimated possible parse result count : {}", possible_parse_results),
        ("Estimated unseen parse result count : {}", unseen_parse_results),
        ("Found {} distinct lexemes", seen_lexemes),
        ("Estimated possible word count from lexemes: {}", words_from_lexemes),
        ("Estimated possible word count from stems: {}", words_from_stems),
        ("Estimated possible word count: {}", possible_words),
        ("Estimated unseen word count : {}", unseen_words),
    )
    for template, figure in diagnostics:
        logger.debug(template.format(figure))

    # NOTE(review): frequency 1 is seeded with the distinct counts instead of
    # being queried like the higher frequencies - confirm this is intended.
    parse_result_freq_of_freqs = {1: seen_parse_results}
    word_freq_of_freqs = {1: seen_words}
    for frequency in range(2, self._smoothing_threshold + 2):
        parse_result_freq_of_freqs[frequency] = finder.find_frequency_of_parse_result_frequency(
            unigrams, frequency)
        word_freq_of_freqs[frequency] = finder.find_frequency_of_word_frequency(
            unigrams, frequency)

    logger.debug("Frequencies of parse result frequencies")
    logger.debug(pformat(parse_result_freq_of_freqs))
    logger.debug("Frequencies of word frequencies")
    logger.debug(pformat(word_freq_of_freqs))

    self._parse_result_count_smoother = SimpleGoodTuringSmoother(
        self._smoothing_threshold, parse_result_freq_of_freqs, unseen_parse_results)
    self._word_count_smoother = SimpleGoodTuringSmoother(
        self._smoothing_threshold, word_freq_of_freqs, unseen_words)

    self._parse_result_count_smoother.initialize()
    self._word_count_smoother.initialize()
def initialize(self):
    """Set up the parse-result and word smoothers from the unigram collection.

    Steps, in order:

    1. Log the smoothing threshold and the averaging constants.
    2. Ask ``NgramTypeFrequencyFinder`` for the distinct parse result, word,
       lexeme (``'lemma_root'``) and stem (``'stem'``) counts of
       ``self._unigram_collection``.
    3. Estimate the possible word count from the stem and lexeme counts, the
       possible parse result count from that word estimate, and subtract the
       distinct counts to obtain the unseen counts.
    4. Build the frequency-of-frequency tables for every frequency from 1 up
       to ``self._smoothing_threshold + 1``.
    5. Create one ``SimpleGoodTuringSmoother`` per table, store them on
       ``self._parse_result_count_smoother`` / ``self._word_count_smoother``
       and call ``initialize()`` on each.
    """
    init_message = "Initializing SimpleGoodTuringContextlessDistributionSmoother for K:{}, AVG_PARSE_RESULTS_FOR_A_WORD:{}, AVG_WORDS_FOR_A_LEXEME:{}"
    logger.debug(init_message.format(
        self._smoothing_threshold,
        self.AVG_PARSE_RESULTS_FOR_A_WORD,
        self.AVG_WORDS_FOR_A_LEXEME))

    # --- observed (distinct) counts -------------------------------------
    collection = self._unigram_collection
    n_parse_results = NgramTypeFrequencyFinder.find_distinct_parse_result_count(collection)
    n_words = NgramTypeFrequencyFinder.find_distinct_word_count(collection)
    n_lexemes = NgramTypeFrequencyFinder.find_distinct_count(collection, ['lemma_root'])
    n_stems = NgramTypeFrequencyFinder.find_distinct_count(collection, ['stem'])

    # --- estimated totals and unseen counts -----------------------------
    word_estimate_by_lexeme = n_lexemes * self.AVG_WORDS_FOR_A_LEXEME
    word_estimate_by_stem = n_stems * self.AVG_WORDS_FOR_A_STEM
    word_estimate = word_estimate_by_stem + word_estimate_by_lexeme
    n_unseen_words = word_estimate - n_words
    parse_result_estimate = word_estimate * self.AVG_PARSE_RESULTS_FOR_A_WORD
    n_unseen_parse_results = parse_result_estimate - n_parse_results

    # --- diagnostics ----------------------------------------------------
    logger.debug("Found {} distinct parse results".format(n_parse_results))
    logger.debug("Found {} distinct words".format(n_words))
    logger.debug("Estimated possible parse result count : {}".format(parse_result_estimate))
    logger.debug("Estimated unseen parse result count : {}".format(n_unseen_parse_results))
    logger.debug("Found {} distinct lexemes".format(n_lexemes))
    logger.debug("Estimated possible word count from lexemes: {}".format(word_estimate_by_lexeme))
    logger.debug("Estimated possible word count from stems: {}".format(word_estimate_by_stem))
    logger.debug("Estimated possible word count: {}".format(word_estimate))
    logger.debug("Estimated unseen word count : {}".format(n_unseen_words))

    # --- frequency-of-frequency tables ----------------------------------
    # Frequency 1 starts from the distinct counts; higher frequencies are
    # looked up one by one, up to and including threshold + 1.
    # NOTE(review): confirm that seeding frequency 1 this way is intended.
    parse_result_table = {1: n_parse_results}
    word_table = {1: n_words}
    highest_frequency = self._smoothing_threshold + 1
    for freq in range(2, highest_frequency + 1):
        parse_result_table[freq] = \
            NgramTypeFrequencyFinder.find_frequency_of_parse_result_frequency(collection, freq)
        word_table[freq] = \
            NgramTypeFrequencyFinder.find_frequency_of_word_frequency(collection, freq)

    logger.debug("Frequencies of parse result frequencies")
    logger.debug(pformat(parse_result_table))
    logger.debug("Frequencies of word frequencies")
    logger.debug(pformat(word_table))

    # --- smoothers ------------------------------------------------------
    parse_result_smoother = SimpleGoodTuringSmoother(
        self._smoothing_threshold, parse_result_table, n_unseen_parse_results)
    self._parse_result_count_smoother = parse_result_smoother
    word_smoother = SimpleGoodTuringSmoother(
        self._smoothing_threshold, word_table, n_unseen_words)
    self._word_count_smoother = word_smoother

    self._parse_result_count_smoother.initialize()
    self._word_count_smoother.initialize()