def find_updated_terms(self, text, updatedText):
    """Compare a text with its updated version word by word and return
    a dict mapping each original word to the word that replaced it."""
    textList = helpers.parse_words(text)
    updatedTextList = helpers.parse_words(updatedText)
    modifications = dict()
    for word, updatedWord in zip(textList, updatedTextList):
        if word != updatedWord:
            modifications[word] = updatedWord
    return modifications
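# Usage sketch (hypothetical inputs, not taken from this module): assuming
# helpers.parse_words("teh cat") returns ["teh", "cat"], then
#
#     find_updated_terms("teh cat", "the cat")
#
# would return {"teh": "the"}; words are paired position by position, so
# inserted or deleted words shift the pairing.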
def lookup_compound(self, phrase, max_edit_distance,
                    ignore_non_words=False):
    """lookup_compound supports compound aware automatic spelling
    correction of multi-word input strings with three cases:
    1. mistakenly inserted space into a correct word led to two
       incorrect terms
    2. mistakenly omitted space between two correct words led to one
       incorrect combined term
    3. multiple independent input terms with/without spelling errors

    Find suggested spellings for a multi-word input string (supports
    word splitting/merging).

    Keyword arguments:
    phrase -- The string being spell checked.
    max_edit_distance -- The maximum edit distance between input and
        suggested words.
    ignore_non_words -- A flag to determine whether numbers and
        acronyms are left unchanged during spell checking.

    Return:
    A list of SuggestItem objects representing suggested correct
    spellings for the input string.
    """
    # Parse input string into single terms
    term_list_1 = helpers.parse_words(phrase)
    # Second list of single terms with preserved cases so we can ignore
    # acronyms (all cap words)
    if ignore_non_words:
        term_list_2 = helpers.parse_words(phrase, True)
    suggestions = list()
    suggestion_parts = list()
    distance_comparer = EditDistance(self._distance_algorithm)

    # translate every item to its best suggestion, otherwise it remains
    # unchanged
    is_last_combi = False
    for i, __ in enumerate(term_list_1):
        if ignore_non_words:
            if helpers.try_parse_int64(term_list_1[i]) is not None:
                suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
                continue
            # if re.match(r"\b[A-Z]{2,}\b", term_list_2[i]):
            if helpers.is_acronym(term_list_2[i]):
                suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                continue
        suggestions = self.lookup(term_list_1[i], Verbosity.TOP,
                                  max_edit_distance)
        # combi check, always before split
        if i > 0 and not is_last_combi:
            suggestions_combi = self.lookup(
                term_list_1[i - 1] + term_list_1[i], Verbosity.TOP,
                max_edit_distance)
            if suggestions_combi:
                best_1 = suggestion_parts[-1]
                if suggestions:
                    best_2 = suggestions[0]
                else:
                    best_2 = SuggestItem(term_list_1[i],
                                         max_edit_distance + 1, 0)
                # make sure we're comparing with the lowercase form of
                # the previous word
                distance_1 = distance_comparer.compare(
                    term_list_1[i - 1] + " " + term_list_1[i],
                    best_1.term.lower() + " " + best_2.term,
                    max_edit_distance)
                if (distance_1 >= 0
                        and suggestions_combi[0].distance + 1 < distance_1):
                    suggestions_combi[0].distance += 1
                    suggestion_parts[-1] = suggestions_combi[0]
                    is_last_combi = True
                    continue
        is_last_combi = False

        # always split terms without suggestion / never split terms with
        # suggestion ed=0 / never split single char terms
        if (suggestions and (suggestions[0].distance == 0
                             or len(term_list_1[i]) == 1)):
            # choose best suggestion
            suggestion_parts.append(suggestions[0])
        else:
            # if no perfect suggestion, split word into pairs
            suggestions_split = list()
            # add original term
            if suggestions:
                suggestions_split.append(suggestions[0])
            if len(term_list_1[i]) > 1:
                for j in range(1, len(term_list_1[i])):
                    part_1 = term_list_1[i][:j]
                    part_2 = term_list_1[i][j:]
                    suggestions_1 = self.lookup(part_1, Verbosity.TOP,
                                                max_edit_distance)
                    if suggestions_1:
                        # if split correction 1 == single word correction
                        if (suggestions and suggestions[0].term
                                == suggestions_1[0].term):
                            break
                        suggestions_2 = self.lookup(part_2, Verbosity.TOP,
                                                    max_edit_distance)
                        if suggestions_2:
                            # if split correction 2 == single word correction
                            if (suggestions and suggestions[0].term
                                    == suggestions_2[0].term):
                                break
                            # select best suggestion for split pair
                            tmp_term = (suggestions_1[0].term + " "
                                        + suggestions_2[0].term)
                            tmp_distance = distance_comparer.compare(
                                term_list_1[i], tmp_term, max_edit_distance)
                            if tmp_distance < 0:
                                tmp_distance = max_edit_distance + 1
                            tmp_count = min(suggestions_1[0].count,
                                            suggestions_2[0].count)
                            suggestion_split = SuggestItem(
                                tmp_term, tmp_distance, tmp_count)
                            suggestions_split.append(suggestion_split)
                            # early termination of split
                            if suggestion_split.distance == 1:
                                break
                if suggestions_split:
                    # select best suggestion for split pair
                    suggestions_split.sort()
                    suggestion_parts.append(suggestions_split[0])
                else:
                    si = SuggestItem(term_list_1[i],
                                     max_edit_distance + 1, 0)
                    suggestion_parts.append(si)
            else:
                si = SuggestItem(term_list_1[i], max_edit_distance + 1, 0)
                suggestion_parts.append(si)
    joined_term = ""
    joined_count = sys.maxsize
    for si in suggestion_parts:
        joined_term += si.term + " "
        joined_count = min(joined_count, si.count)
    suggestion = SuggestItem(
        joined_term.rstrip(),
        distance_comparer.compare(phrase, joined_term, 2**31 - 1),
        joined_count)
    suggestions_line = list()
    suggestions_line.append(suggestion)
    return suggestions_line
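# Usage sketch (hedged; the import path and dictionary file name are
# assumptions based on typical symspellpy usage, not part of this excerpt):
# lookup_compound is meant to be called on a SymSpell instance whose
# frequency dictionary has already been loaded, e.g.
#
#     from symspellpy import SymSpell
#     sym_spell = SymSpell()
#     sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt", 0, 1)
#     result = sym_spell.lookup_compound("whereis th elove", max_edit_distance=2)
#     # result[0].term is a single corrected string, e.g. "where is the love"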
def lookup_compound(self, phrase, max_edit_distance,
                    ignore_non_words=False, transfer_casing=False):
    """`lookup_compound` supports compound aware automatic spelling
    correction of multi-word input strings with three cases:
    1. mistakenly inserted space into a correct word led to two
       incorrect terms
    2. mistakenly omitted space between two correct words led to one
       incorrect combined term
    3. multiple independent input terms with/without spelling errors

    Find suggested spellings for a multi-word input string (supports
    word splitting/merging).

    **Args**:

    * phrase (str): The string being spell checked.
    * max_edit_distance (int): The maximum edit distance between\
        input and suggested words.
    * ignore_non_words (bool): A flag to determine whether numbers\
        and acronyms are left unchanged during spell checking.
    * transfer_casing (bool): A flag to determine whether the casing\
        (e.g. upper- vs lowercase) should be carried over from the\
        phrase.

    **Returns**:
    A list of :class:`SuggestItem` objects representing suggested\
        correct spellings for the input string.
    """
    # Parse input string into single terms
    term_list_1 = helpers.parse_words(phrase)
    # Second list of single terms with preserved cases so we can
    # ignore acronyms (all cap words)
    if ignore_non_words:
        term_list_2 = helpers.parse_words(phrase, True)
    suggestions = list()
    suggestion_parts = list()
    distance_comparer = EditDistance(self._distance_algorithm)

    # translate every item to its best suggestion, otherwise it
    # remains unchanged
    is_last_combi = False
    for i, __ in enumerate(term_list_1):
        if ignore_non_words:
            if helpers.try_parse_int64(term_list_1[i]) is not None:
                suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
                continue
            if helpers.is_acronym(term_list_2[i]):
                suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                continue
        suggestions = self.lookup(term_list_1[i], Verbosity.TOP,
                                  max_edit_distance)
        # combi check, always before split
        if i > 0 and not is_last_combi:
            suggestions_combi = self.lookup(
                term_list_1[i - 1] + term_list_1[i], Verbosity.TOP,
                max_edit_distance)
            if suggestions_combi:
                best_1 = suggestion_parts[-1]
                if suggestions:
                    best_2 = suggestions[0]
                else:
                    # estimated word occurrence probability
                    # P=10 / (N * 10^word length l)
                    best_2 = SuggestItem(term_list_1[i],
                                         max_edit_distance + 1,
                                         10 // 10**len(term_list_1[i]))
                # distance_1 = edit distance between the 2 split terms
                # and their best corrections, used as comparative value
                # for the combination
                distance_1 = best_1.distance + best_2.distance
                if (distance_1 >= 0
                        and (suggestions_combi[0].distance + 1 < distance_1
                             or (suggestions_combi[0].distance + 1 == distance_1
                                 and (suggestions_combi[0].count
                                      > best_1.count / self.N * best_2.count)))):
                    suggestions_combi[0].distance += 1
                    suggestion_parts[-1] = suggestions_combi[0]
                    is_last_combi = True
                    continue
        is_last_combi = False

        # always split terms without suggestion / never split terms
        # with suggestion ed=0 / never split single char terms
        if suggestions and (suggestions[0].distance == 0
                            or len(term_list_1[i]) == 1):
            # choose best suggestion
            suggestion_parts.append(suggestions[0])
        else:
            # if no perfect suggestion, split word into pairs
            suggestion_split_best = None
            # add original term
            if suggestions:
                suggestion_split_best = suggestions[0]
            if len(term_list_1[i]) > 1:
                for j in range(1, len(term_list_1[i])):
                    part_1 = term_list_1[i][:j]
                    part_2 = term_list_1[i][j:]
                    suggestions_1 = self.lookup(part_1, Verbosity.TOP,
                                                max_edit_distance)
                    if suggestions_1:
                        suggestions_2 = self.lookup(part_2, Verbosity.TOP,
                                                    max_edit_distance)
                        if suggestions_2:
                            # select best suggestion for split pair
                            tmp_term = (suggestions_1[0].term + " "
                                        + suggestions_2[0].term)
                            tmp_distance = distance_comparer.compare(
                                term_list_1[i], tmp_term, max_edit_distance)
                            if tmp_distance < 0:
                                tmp_distance = max_edit_distance + 1
                            if suggestion_split_best is not None:
                                if tmp_distance > suggestion_split_best.distance:
                                    continue
                                if tmp_distance < suggestion_split_best.distance:
                                    suggestion_split_best = None
                            if tmp_term in self._bigrams:
                                tmp_count = self._bigrams[tmp_term]
                                # increase count, if split corrections
                                # are part of or identical to the input,
                                # and a single term correction exists
                                if suggestions:
                                    best_si = suggestions[0]
                                    # alternatively remove the single
                                    # term from suggestion_split, but
                                    # then other splittings could win
                                    if (suggestions_1[0].term
                                            + suggestions_2[0].term
                                            == term_list_1[i]):
                                        # make count bigger than count
                                        # of single term correction
                                        tmp_count = max(tmp_count,
                                                        best_si.count + 2)
                                    elif (suggestions_1[0].term == best_si.term
                                          or suggestions_2[0].term == best_si.term):
                                        # make count bigger than count
                                        # of single term correction
                                        tmp_count = max(tmp_count,
                                                        best_si.count + 1)
                                # no single term correction exists
                                elif (suggestions_1[0].term
                                        + suggestions_2[0].term
                                        == term_list_1[i]):
                                    tmp_count = max(
                                        tmp_count,
                                        max(suggestions_1[0].count,
                                            suggestions_2[0].count) + 2)
                            else:
                                # The Naive Bayes probability of the
                                # word combination is the product of
                                # the two word probabilities:
                                # P(AB) = P(A) * P(B)
                                # Use it to estimate the frequency
                                # count of the combination, which then
                                # is used to rank/select the best
                                # splitting variant
                                tmp_count = min(
                                    self.bigram_count_min,
                                    int(suggestions_1[0].count / self.N
                                        * suggestions_2[0].count))
                            suggestion_split = SuggestItem(
                                tmp_term, tmp_distance, tmp_count)
                            if (suggestion_split_best is None
                                    or suggestion_split.count
                                    > suggestion_split_best.count):
                                suggestion_split_best = suggestion_split
                if suggestion_split_best is not None:
                    # select best suggestion for split pair
                    suggestion_parts.append(suggestion_split_best)
                    self._replaced_words[
                        term_list_1[i]] = suggestion_split_best
                else:
                    si = SuggestItem(term_list_1[i], max_edit_distance + 1,
                                     int(10 / 10**len(term_list_1[i])))
                    suggestion_parts.append(si)
                    self._replaced_words[term_list_1[i]] = si
            else:
                # estimated word occurrence probability
                # P=10 / (N * 10^word length l)
                si = SuggestItem(term_list_1[i], max_edit_distance + 1,
                                 int(10 / 10**len(term_list_1[i])))
                suggestion_parts.append(si)
                self._replaced_words[term_list_1[i]] = si
    joined_term = ""
    joined_count = self.N
    for si in suggestion_parts:
        joined_term += si.term + " "
        joined_count *= si.count / self.N
    joined_term = joined_term.rstrip()
    if transfer_casing:
        joined_term = helpers.transfer_casing_for_similar_text(phrase,
                                                               joined_term)
    suggestion = SuggestItem(
        joined_term,
        distance_comparer.compare(phrase, joined_term, 2**31 - 1),
        int(joined_count))
    suggestions_line = list()
    suggestions_line.append(suggestion)
    return suggestions_line
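# Usage sketch (hedged; the import path, dictionary file names, and the
# load_bigram_dictionary call are assumptions based on typical symspellpy
# usage, not defined in this excerpt): this variant additionally consults
# self._bigrams to rank word splits and can preserve the input casing, e.g.
#
#     from symspellpy import SymSpell
#     sym_spell = SymSpell()
#     sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt", 0, 1)
#     sym_spell.load_bigram_dictionary(
#         "frequency_bigramdictionary_en_243_342.txt", 0, 2)
#     result = sym_spell.lookup_compound("Whereis th elove",
#                                        max_edit_distance=2,
#                                        transfer_casing=True)
#     # result[0].term keeps the leading capital, e.g. "Where is the love"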