def segment_words_dynamically(self, string, cache):
    """
    Split an unspaced string into words, maximizing a real-word/non-word score.

    The string is scored as a single word and at every split point; both
    halves of a split are segmented recursively. The result for the string
    and for every substring visited is memoized in ``cache``.

    :param string: words without spaces separating them
    :param cache: mapping from substring to its memoized
        ``(segmentation, score)`` pair; updated in place
    :return: the ``(segmentation, score)`` pair stored in ``cache`` for
        ``string``, where ``segmentation`` is a list of words
    """
    # Memoized answer from an earlier call.
    if string in cache:
        return cache[string]

    length = len(string)

    # Score the unsplit string: single letters and longer words are held
    # to different frequency cutoffs.
    whole_frequency = self.unigram_provider.get_frequency(string)
    if length == 1:
        cutoff = self.LETTER_THRESHOLD
    else:
        cutoff = self.WORD_THRESHOLD
    if whole_frequency > cutoff:
        whole_score = self.SCORE_WORD
    else:
        whole_score = self.PENALTY_NON_WORD

    if length > 1:
        # Start from "no split" and keep any split that scores strictly higher.
        top_words, top_score = [string], whole_score
        for split_at in range(1, length):
            # NOTE(review): bisect_string is defined elsewhere; presumably it
            # returns the prefix and suffix around split_at - confirm.
            left, right = bisect_string(string, split_at)
            left_words, left_score = self.segment_words_dynamically(left, cache)
            right_words, right_score = self.segment_words_dynamically(right, cache)

            word_count = len(left_words) + len(right_words)
            candidate = (left_score + right_score) / word_count
            if candidate > top_score:
                top_words, top_score = left_words + right_words, candidate
        cache[string] = top_words, top_score
    else:
        # Empty or one-character string: nothing left to split.
        cache[string] = [string], whole_score

    return cache[string]
def segment_words_dynamically(self, string, cache):
    """
    Split an unspaced string into words by scoring candidates with bigram
    probabilities.

    The string is scored as a single word and at every split point; both
    halves of a split are segmented recursively. The result for the string
    and for every substring visited is memoized in ``cache``.

    :param string: words without spaces separating them
    :param cache: mapping from substring to its memoized
        ``(segmentation, score)`` pair; updated in place
    :return: the ``(segmentation, score)`` pair stored in ``cache`` for
        ``string``, where ``segmentation`` is a list of words
    """
    # Memoized answer from an earlier call.
    if string in cache:
        return cache[string]

    # Unigram probability of the unsplit string.
    whole_probability = self.unigram_provider.get_frequency(string) / self.total_words

    length = len(string)
    if length > 1:
        top_words, top_score = [string], whole_probability
        for split_at in range(1, length):
            # NOTE(review): bisect_string is defined elsewhere; presumably it
            # returns the prefix and suffix around split_at - confirm.
            left, right = bisect_string(string, split_at)
            left_words, left_score = self.segment_words_dynamically(left, cache)
            right_words, right_score = self.segment_words_dynamically(right, cache)

            # Bigram formed where the two halves meet.
            boundary_first = left_words[-1]
            boundary_pair = (boundary_first, right_words[0])

            # Both counts are floored at 1, so the ratio is never a division
            # by zero and never zero itself.
            first_count = max(self.unigram_provider.get_frequency(boundary_first), 1)
            pair_count = max(1, self.bigram_provider.get_frequency(boundary_pair))
            boundary_probability = pair_count / first_count

            candidate = left_score * right_score * boundary_probability
            merged = left_words + right_words

            # The comparison weights the candidate by its word count, while
            # the score that gets stored is the unweighted one.
            if candidate * len(merged) > top_score:
                top_words, top_score = merged, candidate
        cache[string] = top_words, top_score
    else:
        # Empty or one-character string: nothing left to split.
        cache[string] = [string], whole_probability

    return cache[string]
def segment_words_dynamically(self, string, cache):
    """
    Split an unspaced string into words by optimizing log unigram counts,
    with the score of a split divided by the number of words it produces.

    The string is scored as a single word and at every split point; both
    halves of a split are segmented recursively. The result for the string
    and for every substring visited is memoized in ``cache``.

    :param string: words without spaces separating them
    :param cache: mapping from substring to its memoized
        ``(segmentation, score)`` pair; updated in place
    :return: the ``(segmentation, score)`` pair stored in ``cache`` for
        ``string``, where ``segmentation`` is a list of words
    """
    # Memoized answer from an earlier call.
    if string in cache:
        return cache[string]

    # Log-count of the unsplit string; a zero count scores 0 rather than
    # being passed to log().
    whole_frequency = self.unigram_provider.get_frequency(string)
    if whole_frequency == 0:
        whole_score = 0
    else:
        whole_score = log(whole_frequency)

    length = len(string)
    if length > 1:
        # Start from "no split" and keep any split that scores strictly higher.
        top_words, top_score = [string], whole_score
        for split_at in range(1, length):
            # NOTE(review): bisect_string is defined elsewhere; presumably it
            # returns the prefix and suffix around split_at - confirm.
            left, right = bisect_string(string, split_at)
            left_words, left_score = self.segment_words_dynamically(left, cache)
            right_words, right_score = self.segment_words_dynamically(right, cache)

            # Add the two scores outside log space, take the log again, then
            # divide by the word count so fewer words score higher.
            word_count = len(left_words) + len(right_words)
            candidate = log(exp(left_score) + exp(right_score)) / word_count
            if candidate > top_score:
                top_words, top_score = left_words + right_words, candidate
        cache[string] = top_words, top_score
    else:
        # Empty or one-character string: nothing left to split.
        cache[string] = [string], whole_score

    return cache[string]