def load_dictionary(self,
                        corpus,
                        term_index,
                        count_index,
                        separator=" ",
                        encoding=None):
        """Read word/frequency-count pairs from a text file and add
        each of them through `create_dictionary_entry`, on top of
        whatever is already loaded.

        Lines that split into fewer than two columns, or whose count
        column is rejected by `helpers.try_parse_int64`, are skipped.

        **Args**:

        * corpus (str): The path+filename of the file.
        * term_index (int): The column position of the word.
        * count_index (int): The column position of the frequency
            count.
        * separator (str): The string each line is split on to obtain
            its columns.
        * encoding (str): Text encoding of the dictionary file

        **Returns**:
        True if file loaded, or False if file not found.
        """
        if not os.path.exists(corpus):
            return False
        with open(corpus, "r", encoding=encoding) as infile:
            for raw_line in infile:
                columns = raw_line.rstrip().split(separator)
                # a usable line needs at least a word and a count
                if len(columns) < 2:
                    continue
                word = columns[term_index]
                frequency = helpers.try_parse_int64(columns[count_index])
                if frequency is None:
                    continue
                self.create_dictionary_entry(word, frequency)
        return True
Beispiel #2
0
    def load_dictionary(self,
                        corpus,
                        term_index,
                        count_index,
                        encoding='utf-8'):
        """Read space-separated word/frequency-count pairs from a text
        file and add each of them through create_dictionary_entry, on
        top of whatever is already loaded.

        Lines with fewer than two columns, or whose count column is
        rejected by helpers.try_parse_int64, are skipped.

        Keyword arguments:
        corpus -- The path+filename of the file.
        term_index -- The column position of the word.
        count_index -- The column position of the frequency count.
        encoding -- Text encoding of the dictionary file

        Return:
        True if file loaded, or False if file not found.
        """
        if os.path.exists(corpus):
            with open(corpus, "r", encoding=encoding) as infile:
                for text_line in infile:
                    fields = text_line.rstrip().split(" ")
                    if len(fields) < 2:
                        # not enough columns for a word and a count
                        continue
                    word = fields[term_index]
                    parsed_count = helpers.try_parse_int64(
                        fields[count_index])
                    if parsed_count is not None:
                        self.create_dictionary_entry(word, parsed_count)
            return True
        return False
 def load_dictionary(self,
                     corpus,
                     term_index,
                     count_index,
                     separator=" ",
                     encoding=None):
     """Read word/frequency-count pairs from a gzip-compressed text
     file and add each of them through `create_dictionary_entry`.

     Lines that split into fewer than two columns, or whose count
     column is rejected by `helpers.try_parse_int64`, are skipped.

     **Args**:

     * corpus (str): The path+filename of the gzip file.
     * term_index (int): The column position of the word.
     * count_index (int): The column position of the frequency count.
     * separator (str): The string each line is split on to obtain
         its columns.
     * encoding (str): Text encoding of the decompressed content.

     **Returns**:
     True if file loaded, or False if file not found.
     """
     if not os.path.exists(corpus):
         return False
     # "rt" makes gzip decode the stream to text, so lines are str
     with gzip.open(corpus, "rt", encoding=encoding) as infile:
         for raw_line in infile:
             columns = raw_line.rstrip().split(separator)
             if len(columns) < 2:
                 continue
             word = columns[term_index]
             frequency = helpers.try_parse_int64(columns[count_index])
             if frequency is None:
                 continue
             self.create_dictionary_entry(word, frequency)
     return True
 def load_bigram_dictionary(self,
                            corpus,
                            term_index,
                            count_index,
                            separator=None,
                            encoding=None):
     """Load bigram/frequency-count pairs from a text file into
     `self._bigrams`, lowering `self.bigram_count_min` whenever a
     smaller count is seen.

     With `separator=None` each line is split on whitespace, so the
     bigram occupies two consecutive columns (`term_index` and
     `term_index + 1`) and a line needs at least three columns. With
     an explicit separator the whole bigram is the single column at
     `term_index` and a line needs at least two columns.

     Lines with too few columns, or whose count column is rejected by
     `helpers.try_parse_int64`, are skipped.

     **Args**:

     * corpus (str): The path+filename of the file.
     * term_index (int): The column position of the (first word of
         the) bigram.
     * count_index (int): The column position of the frequency count.
     * separator (str): Column separator, or None to split on
         whitespace.
     * encoding (str): Text encoding of the dictionary file.

     **Returns**:
     True if file loaded, or False if file not found.
     """
     if not os.path.exists(corpus):
         return False
     # A whitespace-split line is "word1 word2 count" (3 columns); with
     # an explicit separator it is "word1 word2<sep>count" (2 columns).
     # Requiring 3 columns in both modes dropped every valid
     # two-column line.
     min_line_parts = 3 if separator is None else 2
     with open(corpus, "r", encoding=encoding) as infile:
         for line in infile:
             line_parts = line.rstrip().split(separator)
             if len(line_parts) < min_line_parts:
                 continue
             if separator is None:
                 key = "{} {}".format(line_parts[term_index],
                                      line_parts[term_index + 1])
             else:
                 key = line_parts[term_index]
             count = helpers.try_parse_int64(line_parts[count_index])
             if count is None:
                 continue
             self._bigrams[key] = count
             if count < self.bigram_count_min:
                 self.bigram_count_min = count
     return True
Beispiel #5
0
    def lookup_compound(self,
                        phrase,
                        max_edit_distance,
                        ignore_non_words=False):
        """lookup_compound supports compound aware automatic spelling
        correction of multi-word input strings with three cases:
        1. mistakenly inserted space into a correct word led to two incorrect
           terms
        2. mistakenly omitted space between two correct words led to one
           incorrect combined term
        3. multiple independent input terms with/without spelling errors

        Find suggested spellings for a multi-word input string (supports word
        splitting/merging).

        Keyword arguments:
        phrase -- The string being spell checked.
        max_edit_distance -- The maximum edit distance between input and
            suggested words.
        ignore_non_words -- When true, terms that helpers.try_parse_int64
            can parse and terms that helpers.is_acronym accepts (checked on
            the case-preserved form) are passed through without a lookup.

        Return:
        A list holding a single SuggestItem: its term is the corrected
        terms joined by single spaces, its distance is the edit distance
        between phrase and that joined term, and its count is the smallest
        count among the parts.
        """
        # Parse input string into single terms
        term_list_1 = helpers.parse_words(phrase)
        # Second list of single terms with preserved cases so we can ignore
        # acronyms (all cap words)
        if ignore_non_words:
            term_list_2 = helpers.parse_words(phrase, True)
        suggestions = []
        suggestion_parts = []
        distance_comparer = EditDistance(self._distance_algorithm)

        # translate every item to its best suggestion, otherwise it remains
        # unchanged
        is_last_combi = False
        for i, term in enumerate(term_list_1):
            if ignore_non_words:
                if helpers.try_parse_int64(term) is not None:
                    suggestion_parts.append(SuggestItem(term, 0, 0))
                    continue
                if helpers.is_acronym(term_list_2[i]):
                    suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                    continue
            suggestions = self.lookup(term, Verbosity.TOP, max_edit_distance)
            # combi check, always before split
            if i > 0 and not is_last_combi:
                suggestions_combi = self.lookup(
                    term_list_1[i - 1] + term, Verbosity.TOP,
                    max_edit_distance)
                if suggestions_combi:
                    best_1 = suggestion_parts[-1]
                    if suggestions:
                        best_2 = suggestions[0]
                    else:
                        best_2 = SuggestItem(term, max_edit_distance + 1, 0)
                    # make sure we're comparing with the lowercase form of the
                    # previous word
                    distance_1 = distance_comparer.compare(
                        term_list_1[i - 1] + " " + term,
                        best_1.term.lower() + " " + best_2.term,
                        max_edit_distance)
                    if (distance_1 >= 0 and
                            suggestions_combi[0].distance + 1 < distance_1):
                        # merging the two terms beats correcting them
                        # separately; the +1 accounts for the removed space
                        suggestions_combi[0].distance += 1
                        suggestion_parts[-1] = suggestions_combi[0]
                        is_last_combi = True
                        continue
            is_last_combi = False

            # always split terms without suggestion / never split terms with
            # suggestion ed=0 / never split single char terms
            if (suggestions and
                    (suggestions[0].distance == 0 or len(term) == 1)):
                # choose best suggestion
                suggestion_parts.append(suggestions[0])
            else:
                # if no perfect suggestion, split word into pairs
                suggestions_split = []
                # add original term
                if suggestions:
                    suggestions_split.append(suggestions[0])
                if len(term) > 1:
                    for j in range(1, len(term)):
                        part_1 = term[:j]
                        part_2 = term[j:]
                        suggestions_1 = self.lookup(part_1, Verbosity.TOP,
                                                    max_edit_distance)
                        if not suggestions_1:
                            continue
                        # stop if the correction of the first part equals
                        # the single-word correction
                        if (suggestions and suggestions[0].term
                                == suggestions_1[0].term):
                            break
                        suggestions_2 = self.lookup(
                            part_2, Verbosity.TOP, max_edit_distance)
                        if not suggestions_2:
                            continue
                        # stop if the correction of the second part equals
                        # the single-word correction
                        if (suggestions and suggestions[0].term
                                == suggestions_2[0].term):
                            break
                        # select best suggestion for split pair
                        tmp_term = (suggestions_1[0].term + " " +
                                    suggestions_2[0].term)
                        tmp_distance = distance_comparer.compare(
                            term, tmp_term, max_edit_distance)
                        if tmp_distance < 0:
                            tmp_distance = max_edit_distance + 1
                        tmp_count = min(suggestions_1[0].count,
                                        suggestions_2[0].count)
                        suggestion_split = SuggestItem(
                            tmp_term, tmp_distance, tmp_count)
                        suggestions_split.append(suggestion_split)
                        # early termination of split
                        if suggestion_split.distance == 1:
                            break

                    if suggestions_split:
                        # select best suggestion for split pair
                        suggestions_split.sort()
                        suggestion_parts.append(suggestions_split[0])
                    else:
                        si = SuggestItem(term, max_edit_distance + 1, 0)
                        suggestion_parts.append(si)
                else:
                    si = SuggestItem(term, max_edit_distance + 1, 0)
                    suggestion_parts.append(si)
        # Strip before measuring: comparing against the joined string with
        # its trailing space still attached made the reported distance
        # disagree with the returned term.
        joined_term = " ".join(si.term for si in suggestion_parts).rstrip()
        joined_count = sys.maxsize
        for si in suggestion_parts:
            joined_count = min(joined_count, si.count)
        suggestion = SuggestItem(
            joined_term,
            distance_comparer.compare(phrase, joined_term, 2**31 - 1),
            joined_count)
        return [suggestion]
    def lookup_compound(self,
                        phrase,
                        max_edit_distance,
                        ignore_non_words=False,
                        transfer_casing=False):
        """`lookup_compound` supports compound aware automatic spelling
        correction of multi-word input strings with three cases:

        1. mistakenly inserted space into a correct word led to two
           incorrect terms
        2. mistakenly omitted space between two correct words led to
           one incorrect combined term
        3. multiple independent input terms with/without spelling
           errors

        Find suggested spellings for a multi-word input string
        (supports word splitting/merging).

        Terms that end up in the split branch are also recorded in
        `self._replaced_words`, keyed by the input term.

        **Args**:

        * phrase (str): The string being spell checked.
        * max_edit_distance (int): The maximum edit distance between
            input and suggested words.
        * ignore_non_words (bool): When true, terms that
            `helpers.try_parse_int64` can parse and terms that
            `helpers.is_acronym` accepts (checked on the
            case-preserved form) are passed through without a lookup.
        * transfer_casing (bool): A flag to determine whether the
            casing (eg upper- vs lowercase) should be carried over
            from the phrase

        **Returns**:
        A list holding a single :class:`SuggestItem` whose term is the
        corrected terms joined by single spaces, whose distance is the
        edit distance between `phrase` and that joined term, and whose
        count is `self.N` scaled by `count / self.N` for every part.
        """
        # Parse input string into single terms
        term_list_1 = helpers.parse_words(phrase)
        # Second list of single terms with preserved cases so we can
        # ignore acronyms (all cap words)
        if ignore_non_words:
            term_list_2 = helpers.parse_words(phrase, True)
        suggestions = list()
        suggestion_parts = list()
        distance_comparer = EditDistance(self._distance_algorithm)

        # translate every item to its best suggestion, otherwise it
        # remains unchanged
        is_last_combi = False
        for i, __ in enumerate(term_list_1):
            if ignore_non_words:
                # numbers and acronyms are kept as-is (distance 0,
                # count 0) and skip the lookup entirely
                if helpers.try_parse_int64(term_list_1[i]) is not None:
                    suggestion_parts.append(SuggestItem(term_list_1[i], 0, 0))
                    continue
                if helpers.is_acronym(term_list_2[i]):
                    suggestion_parts.append(SuggestItem(term_list_2[i], 0, 0))
                    continue
            suggestions = self.lookup(term_list_1[i], Verbosity.TOP,
                                      max_edit_distance)
            # combi check, always before split
            if i > 0 and not is_last_combi:
                suggestions_combi = self.lookup(
                    term_list_1[i - 1] + term_list_1[i], Verbosity.TOP,
                    max_edit_distance)
                if suggestions_combi:
                    best_1 = suggestion_parts[-1]
                    if suggestions:
                        best_2 = suggestions[0]
                    else:
                        # estimated word occurrence probability
                        # P=10 / (N * 10^word length l)
                        best_2 = SuggestItem(term_list_1[i],
                                             max_edit_distance + 1,
                                             10 // 10**len(term_list_1[i]))
                    # distance_1 = sum of the edit distances of the two
                    # separate terms to their best corrections; used as
                    # the comparative value for the combination
                    distance_1 = best_1.distance + best_2.distance
                    # prefer the merged term when it is strictly closer
                    # (the +1 is added for the merge itself), or equally
                    # close but more frequent than the estimated count
                    # of the two separate corrections
                    if (distance_1 >= 0 and
                        (suggestions_combi[0].distance + 1 < distance_1 or
                         (suggestions_combi[0].distance + 1 == distance_1 and
                          (suggestions_combi[0].count >
                           best_1.count / self.N * best_2.count)))):
                        suggestions_combi[0].distance += 1
                        suggestion_parts[-1] = suggestions_combi[0]
                        is_last_combi = True
                        continue
            is_last_combi = False

            # always split terms without suggestion / never split terms
            # with suggestion ed=0 / never split single char terms
            if suggestions and (suggestions[0].distance == 0
                                or len(term_list_1[i]) == 1):
                # choose best suggestion
                suggestion_parts.append(suggestions[0])
            else:
                # if no perfect suggestion, split word into pairs
                suggestion_split_best = None
                # add original term
                if suggestions:
                    suggestion_split_best = suggestions[0]
                if len(term_list_1[i]) > 1:
                    # try every split position of the term
                    for j in range(1, len(term_list_1[i])):
                        part_1 = term_list_1[i][:j]
                        part_2 = term_list_1[i][j:]
                        suggestions_1 = self.lookup(part_1, Verbosity.TOP,
                                                    max_edit_distance)
                        if suggestions_1:
                            suggestions_2 = self.lookup(
                                part_2, Verbosity.TOP, max_edit_distance)
                            if suggestions_2:
                                # select best suggestion for split pair
                                tmp_term = (suggestions_1[0].term + " " +
                                            suggestions_2[0].term)
                                tmp_distance = distance_comparer.compare(
                                    term_list_1[i], tmp_term,
                                    max_edit_distance)
                                # a negative result is replaced by
                                # max_edit_distance + 1
                                if tmp_distance < 0:
                                    tmp_distance = max_edit_distance + 1
                                if suggestion_split_best is not None:
                                    # worse than the current best: skip;
                                    # strictly better: discard the
                                    # current best so this one wins
                                    if tmp_distance > suggestion_split_best.distance:
                                        continue
                                    if tmp_distance < suggestion_split_best.distance:
                                        suggestion_split_best = None
                                if tmp_term in self._bigrams:
                                    tmp_count = self._bigrams[tmp_term]
                                    # increase the count if the split
                                    # corrections are part of, or
                                    # identical to, the input term when
                                    # a single term correction exists
                                    if suggestions:
                                        best_si = suggestions[0]
                                        # alternatively remove the
                                        # single term from
                                        # suggestion_split, but then
                                        # other splittings could win
                                        if suggestions_1[
                                                0].term + suggestions_2[
                                                    0].term == term_list_1[i]:
                                            # make count bigger than
                                            # count of single term
                                            # correction
                                            tmp_count = max(
                                                tmp_count, best_si.count + 2)
                                        elif (suggestions_1[0].term
                                              == best_si.term
                                              or suggestions_2[0].term
                                              == best_si.term):
                                            # make count bigger than
                                            # count of single term
                                            # correction
                                            tmp_count = max(
                                                tmp_count, best_si.count + 1)
                                    # no single term correction exists
                                    elif suggestions_1[0].term + suggestions_2[
                                            0].term == term_list_1[i]:
                                        tmp_count = max(
                                            tmp_count,
                                            max(suggestions_1[0].count,
                                                suggestions_2[0].count) + 2)
                                else:
                                    # The Naive Bayes probability of
                                    # the word combination is the
                                    # product of the two word
                                    # probabilities: P(AB)=P(A)*P(B)
                                    # use it to estimate the frequency
                                    # count of the combination, which
                                    # then is used to rank/select the
                                    # best splitting variant
                                    tmp_count = min(
                                        self.bigram_count_min,
                                        int(suggestions_1[0].count / self.N *
                                            suggestions_2[0].count))
                                suggestion_split = SuggestItem(
                                    tmp_term, tmp_distance, tmp_count)
                                # among candidates that survived the
                                # distance check, keep the one with the
                                # highest count
                                if (suggestion_split_best is None
                                        or suggestion_split.count >
                                        suggestion_split_best.count):
                                    suggestion_split_best = suggestion_split

                    if suggestion_split_best is not None:
                        # select best suggestion for split pair
                        suggestion_parts.append(suggestion_split_best)
                        self._replaced_words[
                            term_list_1[i]] = suggestion_split_best
                    else:
                        # nothing found: keep the term unchanged with
                        # the estimated count described below
                        si = SuggestItem(term_list_1[i], max_edit_distance + 1,
                                         int(10 / 10**len(term_list_1[i])))
                        suggestion_parts.append(si)
                        self._replaced_words[term_list_1[i]] = si
                else:
                    # estimated word occurrence probability
                    # P=10 / (N * 10^word length l)
                    si = SuggestItem(term_list_1[i], max_edit_distance + 1,
                                     int(10 / 10**len(term_list_1[i])))
                    suggestion_parts.append(si)
                    self._replaced_words[term_list_1[i]] = si
        # join the parts into one phrase; the count starts at N and is
        # scaled by count / N for every part
        joined_term = ""
        joined_count = self.N
        for si in suggestion_parts:
            joined_term += si.term + " "
            joined_count *= si.count / self.N
        # drop the trailing space before casing transfer and before
        # measuring the distance to the input phrase
        joined_term = joined_term.rstrip()
        if transfer_casing:
            joined_term = helpers.transfer_casing_for_similar_text(
                phrase, joined_term)
        suggestion = SuggestItem(
            joined_term,
            distance_comparer.compare(phrase, joined_term, 2**31 - 1),
            int(joined_count))
        suggestions_line = list()
        suggestions_line.append(suggestion)

        return suggestions_line