Example #1
    def test_word_segmentation_with_arguments(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = os.path.realpath(os.path.dirname(__file__))
        dictionary_path = os.path.realpath(
            os.path.join(cwd, pardir, "symspellpy",
                         "frequency_dictionary_en_82_765.txt"))

        edit_distance_max = 0
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        typo = "thequickbrownfoxjumpsoverthelazydog"
        correction = "the quick brown fox jumps over the lazy dog"
        result = sym_spell.word_segmentation(typo, edit_distance_max, 11)
        self.assertEqual(correction, result.corrected_string)

        typo = "itwasabrightcolddayinaprilandtheclockswerestrikingthirteen"
        correction = ("it was a bright cold day in april and the clocks "
                      "were striking thirteen")
        result = sym_spell.word_segmentation(typo, edit_distance_max, 11)
        self.assertEqual(correction, result.corrected_string)

        typo = (" itwasthebestoftimesitwastheworstoftimesitwastheageofwisdom"
                "itwastheageoffoolishness")
        correction = ("it was the best of times it was the worst of times "
                      "it was the age of wisdom it was the age of foolishness")
        result = sym_spell.word_segmentation(typo, edit_distance_max, 11)
        self.assertEqual(correction, result.corrected_string)
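Note on the return value: word_segmentation returns a Composition named tuple, which is why older snippets sometimes index it as result[1] where newer ones use result.corrected_string. A minimal sketch of the fields, assuming the frequency dictionary file is available locally; field names follow the symspellpy docs:

from symspellpy import SymSpell

sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt",
                          term_index=0, count_index=1)
result = sym_spell.word_segmentation("thequickbrownfox")
print(result.segmented_string)  # split into words, before spelling correction
print(result.corrected_string)  # split into words and spell-corrected
print(result.distance_sum)      # total edit distance of the corrections
print(result.log_prob_sum)      # sum of the log word-occurrence probabilities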
Example #2
    def test_word_segmentation_ignore_token(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        typo = "24th december"
        result = sym_spell.word_segmentation(typo, ignore_token=r"\d{2}\w*\b")
        self.assertEqual(typo, result.corrected_string)
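The ignore_token parameter takes a regular expression; tokens matching it are passed through segmentation untouched, which is why "24th" survives unchanged above. A minimal sketch, assuming sym_spell is loaded as in the test:

# Tokens matching ignore_token are left unchanged; without it,
# word_segmentation could split or respell "24th" to fit the dictionary.
result = sym_spell.word_segmentation("24th december",
                                     ignore_token=r"\d{2}\w*\b")
print(result.corrected_string)  # "24th december", the token preserved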
Example #3
    def test_word_segmentation_with_arguments(self):
        edit_distance_max = 0
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        typo = "thequickbrownfoxjumpsoverthelazydog"
        correction = "the quick brown fox jumps over the lazy dog"
        result = sym_spell.word_segmentation(typo, edit_distance_max, 11)
        self.assertEqual(correction, result.corrected_string)

        typo = "itwasabrightcolddayinaprilandtheclockswerestrikingthirteen"
        correction = ("it was a bright cold day in april and the clocks "
                      "were striking thirteen")
        result = sym_spell.word_segmentation(typo, edit_distance_max, 11)
        self.assertEqual(correction, result.corrected_string)

        typo = (" itwasthebestoftimesitwastheworstoftimesitwastheageofwisdom"
                "itwastheageoffoolishness")
        correction = ("it was the best of times it was the worst of times "
                      "it was the age of wisdom it was the age of foolishness")
        result = sym_spell.word_segmentation(typo, edit_distance_max, 11)
        self.assertEqual(correction, result.corrected_string)
Example #4
    def test_word_segmentation_ignore_token(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = os.path.realpath(os.path.dirname(__file__))
        dictionary_path = os.path.realpath(
            os.path.join(cwd, pardir, "symspellpy",
                         "frequency_dictionary_en_82_765.txt"))

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        typo = "24th december"
        result = sym_spell.word_segmentation(typo, ignore_token=r"\d{2}\w*\b")
        self.assertEqual(typo, result.corrected_string)
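Examples #1 and #4 target an older symspellpy API whose constructor took an initial capacity as its first argument (the 83000 above); newer releases dropped that parameter, as Examples #2 and #3 show. A quick side-by-side:

# Older symspellpy releases (Examples #1 and #4):
#     sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
# Newer releases drop the initial-capacity argument (Examples #2 and #3):
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)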
Example #5
import operator
import os
from collections import Counter

import enchant
import numpy as np
import pkg_resources
import torch
from symspellpy import SymSpell, Verbosity
from tqdm import tqdm
from transformers import AutoTokenizer, BertForMaskedLM

# `d_print` is assumed to be a project-local debug-print helper.


class WordSuggester:
    """
    Suggest words when the input is misspelled
    """
    def __init__(self):
        d_print("Initializing the vocabulary set..")
        self.d = enchant.Dict("en_US")
        d_print("Initializing BERT pipeline..")

        self.tok = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.sym_spell_cut = SymSpell(max_dictionary_edit_distance=0,
                                      prefix_length=7)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        # term_index is the column of the term and count_index is the
        # column of the term frequency
        self.sym_spell.load_dictionary(dictionary_path,
                                       term_index=0,
                                       count_index=1)
        self.sym_spell_cut.load_dictionary(dictionary_path,
                                           term_index=0,
                                           count_index=1)

    def cross_word_validate(self, word, word_counts, min_counts=2):
        """
        A word is considered valid if it occurs at least `min_counts`
        times in the answers.
        """
        return word_counts[word] >= min_counts

    def is_multiword(self, word):
        """
        Check whether `word` is several words run together, based on
        enchant suggesting a spaced or hyphenated spelling for it.
        """
        suggestions = self.d.suggest(word)
        for sugg in suggestions:
            if "".join(sugg.split(" ")) == word:
                return True, sugg
            if "".join(sugg.split("-")) == word:
                return True, sugg.replace("-", " ")
        return False, ""

    def cross_sugg_validate(self, word, word_counts):
        """
        If any spelling suggestion for `word` already appears among the
        other answers, return (True, <most frequent such suggestion>).
        """
        suggestions = [
            s.term for s in self.sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=2)
        ]
        present_words = {
            word: count
            for word, count in word_counts.items() if word in suggestions
        }
        if len(present_words) == 0:
            return False, ""
        corr_word = max(present_words.items(), key=operator.itemgetter(1))[0]
        return True, corr_word
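    # A hypothetical walk-through: if the misspelling "helo" yields the
    # suggestions "hello" and "help", and "hello" already appears three
    # times in word_counts while "help" never does, the method returns
    # (True, "hello"), i.e. the most frequent suggestion seen in context.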

    def get_word_suggestions(self, word, word_counts):
        """
        Return suggestions for the given word. If the word is already
        valid, return a single-element list containing the word itself.

        Args:
            word (str): the word to find suggestions for
            word_counts (dict): value counts of word for a given emoji (context)
        """
        # If the word appears many times in answers, we keep it
        if self.cross_word_validate(word, word_counts):
            return {"status": "present", "words": [word]}

        # If the word is part of the English vocabulary, we keep it
        if self.d.check(word):
            return {"status": "exist", "words": [word]}

        # If a suggestion associated with the word appears in the rest of
        # the answers, we keep the most common one
        cross_sugg, corr_word = self.cross_sugg_validate(word, word_counts)
        if cross_sugg:
            return {"status": "cross_suggested", "words": [corr_word]}

        # If splitting the word into several words is high-confidence,
        # we disassemble it
        result = self.sym_spell_cut.word_segmentation(word)
        log_confidence = result.log_prob_sum / len(result.corrected_string)
        if log_confidence > -1:
            return {
                "status": "disassembled1",
                "words": [result.corrected_string]
            }

        # Same approach using another library
        is_multi, corr_word = self.is_multiword(word)
        if is_multi:
            return {"status": "disassembled2", "words": [corr_word]}

        # We use the other words as a context to select among the suggestions
        suggestions = [
            sugg.term for sugg in self.sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=2)
        ]
        if len(suggestions) > 0:
            return {"status": "corrected", "words": suggestions}

        # The word is probably unknown
        return {"status": "notfound", "words": [word]}

    def get_context_suggestions(self, word_list):
        """
        Apply get_word_suggestions to every word of an emoji's vocabulary
        (context)

        Args:
            word_list (list of str): words describing the emoji

        Returns:
            [list of dict]: one suggestion dict per word (see
            get_word_suggestions)
        """
        word_counts = Counter(word_list)
        context_suggestions = [
            self.get_word_suggestions(word, word_counts) for word in word_list
        ]
        return context_suggestions

    def find_best_word(self, context, suggestions):
        """
        Find the most appropriate word in suggestions given the context

        Args:
            context (list of str): words defining the context
            suggestions (list of str): suggestions for the word to find

        Returns:
            [str]: the word from suggestions that best matches the context
            according to the BERT output
        """
        # We place the word of interest in the middle of the context
        n = len(context) // 2
        pre_context = " ".join(context[:n])
        post_context = " ".join(context[n:])
        sentence = f"{pre_context} {self.tok.mask_token} {post_context}"

        input_tokens = self.tok.encode(sentence)
        answer_pos = input_tokens.index(self.tok.mask_token_id)

        logits = self.bert(torch.tensor([input_tokens]))[0][0]
        logits = logits[answer_pos]
        suggestions_tokens = [
            self.tok.encode(word)[1:-1] for word in suggestions
        ]
        scores = [
            np.mean([logits[i].item() for i in tokens])
            for tokens in suggestions_tokens
        ]
        best_sugg_idx = np.argmax(scores)
        return suggestions[best_sugg_idx]
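    # Scoring note: encode(word)[1:-1] strips the [CLS]/[SEP] ids, so each
    # suggestion is reduced to its WordPiece ids and scored by the mean
    # masked-LM logit at the mask position; multi-piece words thus compete
    # on an average logit, a simplification rather than a joint probability.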

    def extract_context_suggestions(self, context_suggestions):
        """
        Extract the best word for each suggestion list in the context
        suggestions

        Args:
            context_suggestions (list of dict): suggestion dicts as
                returned by get_word_suggestions

        Returns:
            [list of str]: most appropriate words

        """
        # we don't need the status in the current function
        context_suggestions = [sugg["words"] for sugg in context_suggestions]
        ret_words = []
        for suggestions in context_suggestions:
            # single suggestion: the word is not ambiguous
            if len(suggestions) == 1:
                ret_words.append(suggestions[0])
            else:
                # we gather the single words considered as healthy
                context = [
                    word_list[0] for word_list in context_suggestions
                    if word_list != suggestions and len(word_list) == 1
                ]
                word = self.find_best_word(context, suggestions)

                ret_words.append(word)
        return ret_words

    def process_context(self, context, verbose=False):
        """
        Args:
            context (list of str): words

        Returns:
            [list of str]: corrected words
        """
        if os.environ.get("DEBUG") is not None:
            d_print("Test --> test")
            d_print("Test --> test")
            return context
        context_suggestions = self.get_context_suggestions(context)
        corr_words = self.extract_context_suggestions(context_suggestions)
        if verbose:
            for word, suggestions, corr_word in zip(context,
                                                    context_suggestions,
                                                    corr_words):
                status = suggestions["status"]
                if status == "notfound":
                    d_print(f"Nof found:  {word}")
                elif status not in ["present", "exist"] and word != corr_word:
                    d_print(f"Modified:  {word} --> {corr_word} ({status})")

        return corr_words

    def correct_prod_df(self, form_df, debug=False):
        """
        Correct misspelled words of a dataframe in production format, in place
        """
        grouped_df = form_df.groupby("emoji")
        # TODO: remove the limitation
        em_indexes = [(key, val) for key, val in grouped_df.groups.items()]

        for emoji, indexes in tqdm(em_indexes):
            group = grouped_df.get_group(emoji)["word"]
            words = group.to_list()
            corr_words = self.process_context(words, verbose=True)
            form_df["word"].loc[indexes] = corr_words