    def initialize(self):
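        """
        Creates and initializes a Simple Good-Turing smoother for each ngram type, using
        the distinct and possible ngram counts to derive the unseen mass, and the
        frequencies of ngram frequencies read from the collection.
        """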
        self._vocabulary_sizes_for_ngram_item_types = self._find_vocabulary_sizes(self._ngram_item_types)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("Found vocabulary sizes for ngram types : " + str(self._vocabulary_sizes_for_ngram_item_types))

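        # build one smoother per ordered ngram type: iterate every combination of context
        # item type, context position (leading / trailing) and target item type; different
        # combinations can map to the same ordered type, hence the type_key dedup below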
        for context_type in self._ngram_item_types:
            for context_is_leading in (True, False):
                for target_type in self._ngram_item_types:
                    ngram_type, type_key = self._get_ngram_type_and_key(context_is_leading, context_type, target_type)

                    if type_key in self._smoothers_for_ngram_types:
                        # a smoother for this ngram type was already created in an earlier iteration
                        continue

                    distinct_ngram_count_for_ngram_type = NgramTypeFrequencyFinder.find_distinct_count(
                        self._collection, ngram_type
                    )
                    possible_ngram_count_for_ngram_type = reduce(
                        operator.mul,
                        [
                            self._vocabulary_sizes_for_ngram_item_types[ngram_type_item]
                            for ngram_type_item in ngram_type
                        ],
                    )
                    frequency_of_frequency_0 = possible_ngram_count_for_ngram_type - distinct_ngram_count_for_ngram_type
                    logger.debug("  Distinct ngram count for ngram type = " + str(distinct_ngram_count_for_ngram_type))
                    logger.debug("  Possible ngram count for ngram type = " + str(possible_ngram_count_for_ngram_type))
                    logger.debug("  Frequency of frequency 0 (unseen) = " + str(frequency_of_frequency_0))

                    frequencies_of_frequencies_for_ngram_type = {}
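                    # collect N_1 .. N_(K+1): smoothing an observed count r requires N_(r+1),
                    # so one extra frequency of frequency beyond the threshold K is needed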
                    for i in range(1, self._smoothing_threshold + 2):
                        frequencies_of_frequencies_for_ngram_type[i] = self._find_frequency_of_frequency(ngram_type, i)

                    smoother = SimpleGoodTuringSmoother(
                        self._smoothing_threshold, frequencies_of_frequencies_for_ngram_type, frequency_of_frequency_0
                    )
                    self._smoothers_for_ngram_types[type_key] = smoother

        for smoother in self._smoothers_for_ngram_types.itervalues():
            smoother.initialize(PLOTTING_MODE)

        if logger.isEnabledFor(logging.DEBUG):
            for ngram_type_key, smoother in self._smoothers_for_ngram_types.iteritems():
                # convert the defaultdict to a plain dict (via a json round trip) so it pprints cleanly
                logger.debug("Found frequencies of ngram frequencies for {}: {}".format(
                    ngram_type_key,
                    pprint.pformat(json.loads(json.dumps(smoother._frequencies_of_frequencies)))
                ))
                logger.debug("Found unseen count for {}: {}".format(ngram_type_key, smoother._unseen_count))
                logger.debug("Loglin regression coefficient m for {}: {}".format(
                    ngram_type_key, smoother._loglinregression_m
                ))
                logger.debug("Loglin regression coefficient c for {}: {}".format(
                    ngram_type_key, smoother._loglinregression_c
                ))

    def _find_frequency_of_frequency(self, ngram_type, frequency):
        assert frequency > 0 and ngram_type
        logger.debug(" Finding freq of freq for freq={}, ngram_type={}".format(frequency, ngram_type))
        frequency_from_database = NgramTypeFrequencyFinder.find_frequency_of_frequency(
            self._collection, ngram_type, frequency
        )
        logger.debug("  Frequency of frequency = " + str(frequency_from_database))
        return frequency_from_database
    def _find_vocabulary_sizes(self, ngram_item_types):
        """
        @return: Vocabulary sizes, which is number of distinct unigrams for surfaces, stems and lexemes
        @rtype: list
        """
        vocabulary_sizes_for_types = {}
        for ngram_item_type in ngram_item_types:
            vocabulary_size_for_type = NgramTypeFrequencyFinder.find_distinct_count(
                self._unigram_collection, [ngram_item_type]
            )
            vocabulary_sizes_for_types[ngram_item_type] = vocabulary_size_for_type

        return vocabulary_sizes_for_types
    def __init__(self, ngram_length, smoothing_threshold, collection,
                 unigram_collection):
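        """
        @param ngram_length: length of the ngrams to smooth; must be >= 2
        @param smoothing_threshold: Simple Good-Turing threshold K; must be > 1. Frequencies
        of frequencies are collected up to K+1 during initialization.
        @param collection: collection holding the ngram occurrences
        @param unigram_collection: collection holding the unigram occurrences, used to
        compute vocabulary sizes

        A minimal usage sketch (the two collection arguments are placeholders for whatever
        NgramTypeFrequencyFinder queries):

            smoother = SimpleGoodTuringNGramFrequencySmoother(2, 5, bigram_collection, unigram_collection)
            smoother.initialize()
        """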
        super(SimpleGoodTuringNGramFrequencySmoother, self).__init__()

        self._ngram_length = ngram_length
        self._smoothing_threshold = smoothing_threshold
        self._collection = collection
        self._unigram_collection = unigram_collection
        self._ngram_type_frequency_finder = NgramTypeFrequencyFinder()

        assert ngram_length >= 2
        assert smoothing_threshold > 1

        self._ngram_item_types = ['surface', 'stem', 'lemma_root']

        self._smoothers_for_ngram_types = {}
    def initialize(self):
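        """
        Initializes SimpleGoodTuringContextlessDistributionSmoother: builds one Simple
        Good-Turing smoother over parse result frequencies and one over word frequencies.
        The unseen mass of each is estimated from the distinct lexeme and stem counts
        scaled by the AVG_* constants, since the full word space is never observed.
        """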
        logger.debug(
            "Initializing SimpleGoodTuringContextlessDistributionSmoother for K:{}, AVG_PARSE_RESULTS_FOR_A_WORD:{}, AVG_WORDS_FOR_A_LEXEME:{}".format(
                self._smoothing_threshold, self.AVG_PARSE_RESULTS_FOR_A_WORD, self.AVG_WORDS_FOR_A_LEXEME))

        distinct_parse_result_count = NgramTypeFrequencyFinder.find_distinct_parse_result_count(
            self._unigram_collection)
        distinct_word_count = NgramTypeFrequencyFinder.find_distinct_word_count(self._unigram_collection)

        distinct_lexeme_count = NgramTypeFrequencyFinder.find_distinct_count(self._unigram_collection, ['lemma_root'])
        distinct_stem_count = NgramTypeFrequencyFinder.find_distinct_count(self._unigram_collection, ['stem'])
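        # estimate the size of the full (mostly unobserved) word space from the observed
        # lexeme and stem counts, scaled by the assumed per-lexeme / per-stem averages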
        possible_word_count_estimate_from_lexemes = distinct_lexeme_count * self.AVG_WORDS_FOR_A_LEXEME
        possible_word_count_estimate_from_stems = distinct_stem_count * self.AVG_WORDS_FOR_A_STEM

        possible_word_count_estimate = possible_word_count_estimate_from_stems + possible_word_count_estimate_from_lexemes
        unseen_word_count = possible_word_count_estimate - distinct_word_count

        possible_parse_result_count_estimate = possible_word_count_estimate * self.AVG_PARSE_RESULTS_FOR_A_WORD
        unseen_parse_result_count = possible_parse_result_count_estimate - distinct_parse_result_count

        logger.debug("Found {} distinct parse results".format(distinct_parse_result_count))
        logger.debug("Found {} distinct words".format(distinct_word_count))
        logger.debug("Estimated possible parse result count : {}".format(possible_parse_result_count_estimate))
        logger.debug("Estimated unseen parse result count : {}".format(unseen_parse_result_count))

        logger.debug("Found {} distinct lexemes".format(distinct_lexeme_count))
        logger.debug("Estimated possible word count from lexemes: {}".format(possible_word_count_estimate_from_lexemes))
        logger.debug("Estimated possible word count from stems: {}".format(possible_word_count_estimate_from_stems))
        logger.debug("Estimated possible word count: {}".format(possible_word_count_estimate))
        logger.debug("Estimated unseen word count : {}".format(unseen_word_count))

        frequencies_of_parse_result_frequencies = {1: distinct_parse_result_count}
        frequencies_of_word_frequencies = {1: distinct_word_count}

        for i in range(2, self._smoothing_threshold + 2):
            frequencies_of_parse_result_frequencies[i] = NgramTypeFrequencyFinder.find_frequency_of_parse_result_frequency(
                self._unigram_collection, i
            )
            frequencies_of_word_frequencies[i] = NgramTypeFrequencyFinder.find_frequency_of_word_frequency(
                self._unigram_collection, i
            )

        logger.debug("Frequencies of parse result frequencies")
        logger.debug(pformat(frequencies_of_parse_result_frequencies))

        logger.debug("Frequencies of word frequencies")
        logger.debug(pformat(frequencies_of_word_frequencies))

        self._parse_result_count_smoother = SimpleGoodTuringSmoother(
            self._smoothing_threshold, frequencies_of_parse_result_frequencies, unseen_parse_result_count
        )

        self._word_count_smoother = SimpleGoodTuringSmoother(
            self._smoothing_threshold, frequencies_of_word_frequencies, unseen_word_count
        )

        self._parse_result_count_smoother.initialize()
        self._word_count_smoother.initialize()