def find_updated_terms(self, text, updatedText):
    """Compare a text with its updated version word by word and return
    a dict mapping each original word to the word that replaced it."""
    textList = helpers.parse_words(text)
    updatedTextList = helpers.parse_words(updatedText)
    modifications = dict()
    for word, updatedWord in zip(textList, updatedTextList):
        if word != updatedWord:
            modifications[word] = updatedWord
    return modifications
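# Usage sketch (hypothetical inputs, not taken from this module): assuming
# helpers.parse_words("teh cat") returns ["teh", "cat"], then
#
#     find_updated_terms("teh cat", "the cat")
#
# would return {"teh": "the"}; words are paired position by position, so
# inserted or deleted words shift the pairing.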
def lookup_compound(self, phrase, max_edit_distance,
                    ignore_non_words=False):
    """lookup_compound supports compound aware automatic spelling
    correction of multi-word input strings with three cases:
    1. mistakenly inserted space into a correct word led to two
       incorrect terms
    2. mistakenly omitted space between two correct words led to one
       incorrect combined term
    3. multiple independent input terms with/without spelling errors

    Find suggested spellings for a multi-word input string (supports
    word splitting/merging).

    Keyword arguments:
    phrase -- The string being spell checked.
    max_edit_distance -- The maximum edit distance between input and
        suggested words.
    ignore_non_words -- A flag to determine whether numbers and
        acronyms are left unchanged during spell checking.

    Return:
    A list of SuggestItem objects representing suggested correct
    spellings for the input string.
    """
    # Parse input string into single terms
    term_list_1 = helpers.parse_words(phrase)
    # Second list of single terms with preserved cases so we can ignore
    # acronyms (all cap words)
    if ignore_non_words:
        term_list_2 = helpers.parse_words(phrase, True)
    suggestions = list()
    suggestion_parts = list()
    distance_comparer = EditDistance(self._distance_algorithm)

    # translate every item to its best suggestion, otherwise it remains
    # unchanged
    is_last_combi = False
    for i, __ in enumerate(term_list_1):
        if ignore_non_words:
            if helpers.try_parse_int64(term_list_1[i]) is not None:
                suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
                continue
            # if re.match(r"\b[A-Z]{2,}\b", term_list_2[i]):
            if helpers.is_acronym(term_list_2[i]):
                suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                continue
        suggestions = self.lookup(term_list_1[i], Verbosity.TOP,
                                  max_edit_distance)
        # combi check, always before split
        if i > 0 and not is_last_combi:
            suggestions_combi = self.lookup(
                term_list_1[i - 1] + term_list_1[i], Verbosity.TOP,
                max_edit_distance)
            if suggestions_combi:
                best_1 = suggestion_parts[-1]
                if suggestions:
                    best_2 = suggestions[0]
                else:
                    best_2 = SuggestItem(term_list_1[i],
                                         max_edit_distance + 1, 0)
                # make sure we're comparing with the lowercase form of
                # the previous word
                distance_1 = distance_comparer.compare(
                    term_list_1[i - 1] + " " + term_list_1[i],
                    best_1.term.lower() + " " + best_2.term,
                    max_edit_distance)
                if (distance_1 >= 0
                        and suggestions_combi[0].distance + 1 < distance_1):
                    suggestions_combi[0].distance += 1
                    suggestion_parts[-1] = suggestions_combi[0]
                    is_last_combi = True
                    continue
        is_last_combi = False

        # always split terms without suggestion / never split terms with
        # suggestion ed=0 / never split single char terms
        if (suggestions and (suggestions[0].distance == 0
                             or len(term_list_1[i]) == 1)):
            # choose best suggestion
            suggestion_parts.append(suggestions[0])
        else:
            # if no perfect suggestion, split word into pairs
            suggestions_split = list()
            # add original term
            if suggestions:
                suggestions_split.append(suggestions[0])
            if len(term_list_1[i]) > 1:
                for j in range(1, len(term_list_1[i])):
                    part_1 = term_list_1[i][:j]
                    part_2 = term_list_1[i][j:]
                    suggestions_1 = self.lookup(part_1, Verbosity.TOP,
                                                max_edit_distance)
                    if suggestions_1:
                        # if split correction 1 == single word correction
                        if (suggestions and suggestions[0].term
                                == suggestions_1[0].term):
                            break
                        suggestions_2 = self.lookup(part_2, Verbosity.TOP,
                                                    max_edit_distance)
                        if suggestions_2:
                            # if split correction 2 == single word correction
                            if (suggestions and suggestions[0].term
                                    == suggestions_2[0].term):
                                break
                            # select best suggestion for split pair
                            tmp_term = (suggestions_1[0].term + " "
                                        + suggestions_2[0].term)
                            tmp_distance = distance_comparer.compare(
                                term_list_1[i], tmp_term, max_edit_distance)
                            if tmp_distance < 0:
                                tmp_distance = max_edit_distance + 1
                            tmp_count = min(suggestions_1[0].count,
                                            suggestions_2[0].count)
                            suggestion_split = SuggestItem(
                                tmp_term, tmp_distance, tmp_count)
                            suggestions_split.append(suggestion_split)
                            # early termination of split
                            if suggestion_split.distance == 1:
                                break
                if suggestions_split:
                    # select best suggestion for split pair
                    suggestions_split.sort()
                    suggestion_parts.append(suggestions_split[0])
                else:
                    si = SuggestItem(term_list_1[i],
                                     max_edit_distance + 1, 0)
                    suggestion_parts.append(si)
            else:
                si = SuggestItem(term_list_1[i], max_edit_distance + 1, 0)
                suggestion_parts.append(si)
    joined_term = ""
    joined_count = sys.maxsize
    for si in suggestion_parts:
        joined_term += si.term + " "
        joined_count = min(joined_count, si.count)
    suggestion = SuggestItem(
        joined_term.rstrip(),
        distance_comparer.compare(phrase, joined_term, 2**31 - 1),
        joined_count)
    suggestions_line = list()
    suggestions_line.append(suggestion)
    return suggestions_line
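# Usage sketch (hedged; the import path and dictionary file name are
# assumptions based on typical symspellpy usage, not part of this excerpt):
# lookup_compound is meant to be called on a SymSpell instance whose
# frequency dictionary has already been loaded, e.g.
#
#     from symspellpy import SymSpell
#     sym_spell = SymSpell()
#     sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt", 0, 1)
#     result = sym_spell.lookup_compound("whereis th elove", max_edit_distance=2)
#     # result[0].term is a single corrected string, e.g. "where is the love"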
def lookup_compound(self, phrase, max_edit_distance,
                    ignore_non_words=False, transfer_casing=False):
    """`lookup_compound` supports compound aware automatic spelling
    correction of multi-word input strings with three cases:
    1. mistakenly inserted space into a correct word led to two
       incorrect terms
    2. mistakenly omitted space between two correct words led to one
       incorrect combined term
    3. multiple independent input terms with/without spelling errors

    Find suggested spellings for a multi-word input string (supports
    word splitting/merging).

    **Args**:

    * phrase (str): The string being spell checked.
    * max_edit_distance (int): The maximum edit distance between\
        input and suggested words.
    * ignore_non_words (bool): A flag to determine whether numbers\
        and acronyms are left unchanged during spell checking.
    * transfer_casing (bool): A flag to determine whether the casing\
        (e.g. upper- vs lowercase) should be carried over from the\
        phrase.

    **Returns**:
    A list of :class:`SuggestItem` objects representing suggested\
        correct spellings for the input string.
    """
    # Parse input string into single terms
    term_list_1 = helpers.parse_words(phrase)
    # Second list of single terms with preserved cases so we can
    # ignore acronyms (all cap words)
    if ignore_non_words:
        term_list_2 = helpers.parse_words(phrase, True)
    suggestions = list()
    suggestion_parts = list()
    distance_comparer = EditDistance(self._distance_algorithm)

    # translate every item to its best suggestion, otherwise it
    # remains unchanged
    is_last_combi = False
    for i, __ in enumerate(term_list_1):
        if ignore_non_words:
            if helpers.try_parse_int64(term_list_1[i]) is not None:
                suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
                continue
            if helpers.is_acronym(term_list_2[i]):
                suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                continue
        suggestions = self.lookup(term_list_1[i], Verbosity.TOP,
                                  max_edit_distance)
        # combi check, always before split
        if i > 0 and not is_last_combi:
            suggestions_combi = self.lookup(
                term_list_1[i - 1] + term_list_1[i], Verbosity.TOP,
                max_edit_distance)
            if suggestions_combi:
                best_1 = suggestion_parts[-1]
                if suggestions:
                    best_2 = suggestions[0]
                else:
                    # estimated word occurrence probability
                    # P=10 / (N * 10^word length l)
                    best_2 = SuggestItem(term_list_1[i],
                                         max_edit_distance + 1,
                                         10 // 10**len(term_list_1[i]))
                # distance_1 = edit distance between the 2 split terms
                # and their best corrections, used as comparative value
                # for the combination
                distance_1 = best_1.distance + best_2.distance
                if (distance_1 >= 0
                        and (suggestions_combi[0].distance + 1 < distance_1
                             or (suggestions_combi[0].distance + 1 == distance_1
                                 and (suggestions_combi[0].count
                                      > best_1.count / self.N * best_2.count)))):
                    suggestions_combi[0].distance += 1
                    suggestion_parts[-1] = suggestions_combi[0]
                    is_last_combi = True
                    continue
        is_last_combi = False

        # always split terms without suggestion / never split terms
        # with suggestion ed=0 / never split single char terms
        if suggestions and (suggestions[0].distance == 0
                            or len(term_list_1[i]) == 1):
            # choose best suggestion
            suggestion_parts.append(suggestions[0])
        else:
            # if no perfect suggestion, split word into pairs
            suggestion_split_best = None
            # add original term
            if suggestions:
                suggestion_split_best = suggestions[0]
            if len(term_list_1[i]) > 1:
                for j in range(1, len(term_list_1[i])):
                    part_1 = term_list_1[i][:j]
                    part_2 = term_list_1[i][j:]
                    suggestions_1 = self.lookup(part_1, Verbosity.TOP,
                                                max_edit_distance)
                    if suggestions_1:
                        suggestions_2 = self.lookup(part_2, Verbosity.TOP,
                                                    max_edit_distance)
                        if suggestions_2:
                            # select best suggestion for split pair
                            tmp_term = (suggestions_1[0].term + " "
                                        + suggestions_2[0].term)
                            tmp_distance = distance_comparer.compare(
                                term_list_1[i], tmp_term, max_edit_distance)
                            if tmp_distance < 0:
                                tmp_distance = max_edit_distance + 1
                            if suggestion_split_best is not None:
                                if tmp_distance > suggestion_split_best.distance:
                                    continue
                                if tmp_distance < suggestion_split_best.distance:
                                    suggestion_split_best = None
                            if tmp_term in self._bigrams:
                                tmp_count = self._bigrams[tmp_term]
                                # increase count, if split corrections
                                # are part of or identical to the input,
                                # and a single term correction exists
                                if suggestions:
                                    best_si = suggestions[0]
                                    # alternatively remove the single
                                    # term from suggestion_split, but
                                    # then other splittings could win
                                    if (suggestions_1[0].term
                                            + suggestions_2[0].term
                                            == term_list_1[i]):
                                        # make count bigger than count
                                        # of single term correction
                                        tmp_count = max(tmp_count,
                                                        best_si.count + 2)
                                    elif (suggestions_1[0].term == best_si.term
                                          or suggestions_2[0].term == best_si.term):
                                        # make count bigger than count
                                        # of single term correction
                                        tmp_count = max(tmp_count,
                                                        best_si.count + 1)
                                # no single term correction exists
                                elif (suggestions_1[0].term
                                        + suggestions_2[0].term
                                        == term_list_1[i]):
                                    tmp_count = max(
                                        tmp_count,
                                        max(suggestions_1[0].count,
                                            suggestions_2[0].count) + 2)
                            else:
                                # The Naive Bayes probability of the
                                # word combination is the product of
                                # the two word probabilities:
                                # P(AB) = P(A) * P(B)
                                # Use it to estimate the frequency
                                # count of the combination, which then
                                # is used to rank/select the best
                                # splitting variant
                                tmp_count = min(
                                    self.bigram_count_min,
                                    int(suggestions_1[0].count / self.N
                                        * suggestions_2[0].count))
                            suggestion_split = SuggestItem(
                                tmp_term, tmp_distance, tmp_count)
                            if (suggestion_split_best is None
                                    or suggestion_split.count
                                    > suggestion_split_best.count):
                                suggestion_split_best = suggestion_split
                if suggestion_split_best is not None:
                    # select best suggestion for split pair
                    suggestion_parts.append(suggestion_split_best)
                    self._replaced_words[
                        term_list_1[i]] = suggestion_split_best
                else:
                    si = SuggestItem(term_list_1[i], max_edit_distance + 1,
                                     int(10 / 10**len(term_list_1[i])))
                    suggestion_parts.append(si)
                    self._replaced_words[term_list_1[i]] = si
            else:
                # estimated word occurrence probability
                # P=10 / (N * 10^word length l)
                si = SuggestItem(term_list_1[i], max_edit_distance + 1,
                                 int(10 / 10**len(term_list_1[i])))
                suggestion_parts.append(si)
                self._replaced_words[term_list_1[i]] = si
    joined_term = ""
    joined_count = self.N
    for si in suggestion_parts:
        joined_term += si.term + " "
        joined_count *= si.count / self.N
    joined_term = joined_term.rstrip()
    if transfer_casing:
        joined_term = helpers.transfer_casing_for_similar_text(phrase,
                                                               joined_term)
    suggestion = SuggestItem(
        joined_term,
        distance_comparer.compare(phrase, joined_term, 2**31 - 1),
        int(joined_count))
    suggestions_line = list()
    suggestions_line.append(suggestion)
    return suggestions_line
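# Usage sketch (hedged; the import path, dictionary file names, and the
# load_bigram_dictionary call are assumptions based on typical symspellpy
# usage, not defined in this excerpt): this variant additionally consults
# self._bigrams to rank word splits and can preserve the input casing, e.g.
#
#     from symspellpy import SymSpell
#     sym_spell = SymSpell()
#     sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt", 0, 1)
#     sym_spell.load_bigram_dictionary(
#         "frequency_bigramdictionary_en_243_342.txt", 0, 2)
#     result = sym_spell.lookup_compound("Whereis th elove",
#                                        max_edit_distance=2,
#                                        transfer_casing=True)
#     # result[0].term keeps the leading capital, e.g. "Where is the love"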