예제 #1
0
    def test_suggestions(self):
        """Check that correct() never returns more than topn candidates"""

        spell_checker = HunSpelling(self.dic, self.aff)

        # run the same check for the top 1 and the top 5 candidates; the
        # sample stream is re-created for every limit
        for limit in (1, 5):
            for query in json_controller.stream_field(self.samples, 'noisy'):
                candidates = spell_checker.correct(query, topn=limit)
                # assertLessEqual reports both values when the check fails
                self.assertLessEqual(len(candidates), limit)
예제 #2
0
class B1Correction:
    """
    Spelling correction pipeline (baseline 1)

    Three components cooperate to correct a query:
    - spacy handles tokenization and named-entity detection
    - hunspell detects isolated non-word spelling errors and suggests
      candidate corrections
    - a n-gram language model reranks the candidates

    Every component starts out unset and is attached through its own
    ``load_*`` method.
    """
    def __init__(self):
        """Create the baseline with no component loaded yet"""

        # components are attached later through the load_* methods
        self.nlp = self.hunspell = self.ngram = None

        self.logger = logging.getLogger(__name__)

    def load_spacy(self, nlp_model, disable=None):
        """Attach the spacy NLP pipelines built from the given model"""

        loader = SpacyLoader(nlp_model, disable=disable)
        self.nlp = loader
        self.logger.info('Loaded spacy NLP model')

    def load_hunspell(self, dic_file, aff_file, extra_dic=None):
        """Attach the hunspell checker built from the dictionary files"""

        checker = HunSpelling(dic_file, aff_file, extra_dic=extra_dic)
        self.hunspell = checker
        self.logger.info('Loaded hunspell checker')

    def load_ngram(self, ngram_model, **kwargs):
        """Attach the n-gram language model, forwarding extra options"""

        model = LanguageModel(ngram_model, **kwargs)
        self.ngram = model
        self.logger.info('Loaded n-gram language model')

    def correct(self, query, topn=5):
        """Return at most `topn` candidate corrections for the query"""

        # step 1: tokens plus the flags of tokens the spellchecker skips
        tokens, flags = self.nlp.split_and_flag(query)

        # step 2: correction suggestions proposed by hunspell
        suggestions = self.hunspell.correct(tokens, ignore=flags)

        # step 3: suggestions re-ordered by the n-gram language model
        ranked = self.ngram.order_sequences(suggestions)

        # step 4: drop the spaces surrounding punctuation marks
        cleaned = [
            str_utils.remove_spaces_apostrophes(sequence)
            for sequence in ranked
        ]

        return cleaned[:topn]
예제 #3
0
    def test_eval_suggestions(self):
        """Score the top-1 corrections against the clean references"""

        checker = HunSpelling(self.dic, self.aff)

        expected, predicted = [], []

        # collect the top 1 candidates alongside their reference correction
        pairs = json_controller.stream(self.samples, 'noisy', 'clean')
        for noisy, clean in pairs:
            predicted.append(checker.correct(noisy, topn=1))
            expected.append(clean)

        evaluator = Evaluation()
        evaluator.load_from_lists(predicted, expected)

        # 1 / 300 correct suggestions ('méthode de recherche en histoire')
        self.assertEqual(evaluator.performance(1), (0.33, 0.33, 0.33))
예제 #4
0
    def load_hunspell(self, dic_file, aff_file, extra_dic=None):
        """Attach the hunspell checker built from the dictionary files"""

        checker = HunSpelling(dic_file, aff_file, extra_dic=extra_dic)
        self.hunspell = checker
        self.logger.info('Loaded hunspell checker')
예제 #5
0
    def test_new_dict(self):
        """Test adding an extra dictionary"""

        spell_checker = HunSpelling(self.dic, self.aff)
        spell_checker.add_extra_dictionary(self.dic)

        # assertFalse(None, obj) only checked that None is falsy (obj was
        # taken as the failure message); assert on the checker itself
        self.assertIsNotNone(spell_checker)
예제 #6
0
    def test_spellings(self):
        """Check misspelling detection before and after editing the words"""

        checker = HunSpelling(self.dic, self.aff)

        # initially only 'pourquoi' and 'ville' are reported as misspelled
        vocabulary = ['aide', 'brut', 'pourquoi', 'calcul', 'ville']
        flagged = list(map(checker.is_misspelled, vocabulary))
        self.assertEqual([False, False, True, False, True], flagged)

        # call add_words with an invalid (None) word list
        checker.add_words(None)

        # once added, the two words are no longer reported
        extra = ['pourquoi', 'ville']
        checker.add_words(extra)
        flagged = list(map(checker.is_misspelled, extra))
        self.assertEqual([False, False], flagged)

        # once removed again (after an invalid None call), they are
        # reported as misspelled like before
        checker.remove_words(None)
        checker.remove_words(extra)
        flagged = list(map(checker.is_misspelled, extra))
        self.assertEqual([True, True], flagged)
예제 #7
0
    def test_extra_load(self):
        """Add an extra dictionary at construction time"""

        spell_checker = HunSpelling(self.dic, self.aff, self.dic)

        # assertFalse(None, obj) only checked that None is falsy (obj was
        # taken as the failure message); assert on the checker itself
        self.assertIsNotNone(spell_checker)