Python Hunspell Exemples, hunspell.Hunspell Python Exemples

Exemple #1

0

Afficher le fichier

def spell_corrector(df, lang1, lang2):
    """Spell-correct every sentence in ``df['L1']`` with Hunspell.

    Each sentence is split on word boundaries; alphabetic pieces that Hunspell
    rejects are replaced by Hunspell's first suggestion, everything else
    (punctuation, spaces, correct words) is kept unchanged.

    :param df: mapping/dataframe with an ``'L1'`` column of sentences and,
        when ``lang2`` is given, an ``'L2'`` column that is passed through.
    :param lang1: not used by this function.
    :param lang2: when not ``None`` the result also carries the ``'L2'`` column.
    :return: a pandas DataFrame with column ``'L1'`` (and ``'L2'`` if requested).
    """
    # Create an object of the Hunspell class
    h = Hunspell()
    print('I am spell_checker')

    corrected_sentences = []
    for sent in df['L1']:
        pieces = []
        # Splitting on r'\b' keeps the separators, so joining restores the sentence.
        for w in re.split(r'\b', sent):
            # Non-words (punctuation marks, spaces) and correct words are kept as they are.
            if not w.isalpha() or h.spell(w):
                pieces.append(w)
                continue
            suggest = h.suggest(w)
            # TODO : Parse the list and find the n-gram probability to find the
            # best candidate. For now the first suggestion is used.
            # Hunspell may offer no suggestion at all; keep the original word
            # in that case instead of failing with an IndexError.
            pieces.append(suggest[0] if suggest else w)
        corrected_sentences.append(''.join(pieces))

    # Convert the corrected sentences into a pandas dataframe to return
    if lang2 is not None:
        return pd.DataFrame.from_dict(
            {'L1': corrected_sentences, 'L2': list(df['L2'])})
    return pd.DataFrame(corrected_sentences, columns=['L1'])

Exemple #2

0

Afficher le fichier

Fichier : hunspell_checker.py Projet : phymucs/Robustness-of-MT-DNNs

class HunspellChecker(object):
    """Corrects words and whitespace-separated text with a default Hunspell instance."""

    def __init__(self):
        self.checker = Hunspell()
        # Tokens in this set (English stopwords and punctuation characters)
        # are never sent to the spell checker.
        punctuation = set(string.punctuation)
        self.stopwords = set(SW.words("english")) | punctuation

    def correct_word(self, word):
        """Return the first Hunspell suggestion for a misspelled word, else the word itself.

        Borrowed from:
        https://datascience.blog.wzb.eu/2016/07/13/autocorrecting-misspelled-words-in-python-using-hunspell/
        """
        if self.checker.spell(word):
            return word
        candidates = self.checker.suggest(word)
        return candidates[0] if len(candidates) > 0 else word

    def correct_string(self, text, ensure_length=False):
        """Split *text* on whitespace, correct each token and re-join with single spaces.

        With ``ensure_length`` only the first word of a multi-word correction is
        kept, so the number of tokens stays the same.
        """
        def fix(token):
            if token in self.stopwords:
                return token
            replacement = self.correct_word(token)
            return replacement.split()[0] if ensure_length else replacement

        return " ".join(fix(token) for token in text.split())

Exemple #3

0

Afficher le fichier

Fichier : Apply_Hunspell.py Projet : Lenala39/Spell-Checker-Comparison

def make_checker():
    '''
    Build the 'de_DE_frami' Hunspell checker, choosing the dictionary
    directory according to the operating system reported by platform.system().
    :return: Hunspell object h
    '''
    on_windows = platform.system() == 'Windows'
    data_dir = (
        "C:\\Users\\Lena_Langholf\\Dropbox\\Spell_Checking\\dictionaries"
        if on_windows
        else "/home/lena/Desktop/million_post_corpus/dictionaries"
    )
    return Hunspell('de_DE_frami', hunspell_data_dir=data_dir)

Exemple #4

0

Afficher le fichier

Fichier : SpellChecker.py Projet : LuisVilarBarbosa/TextCategorizer

 def __init__(self, language='en_US', hunspell_data_dir='./hunspell', n_jobs=1):
     """Fetch the dictionary if needed and set up the Hunspell checker.

     :param language: dictionary name passed to Hunspell and to get_dict.
     :param hunspell_data_dir: directory holding the dictionary files; the
         disk cache is placed in its 'cache' sub-directory.
     :param n_jobs: value handed to Hunspell.set_concurrency.
     """
     SpellChecker.get_dict(language, hunspell_data_dir)
     cache_dir = os.path.join(hunspell_data_dir, 'cache')
     self.hunspell = Hunspell(
         language,
         hunspell_data_dir=hunspell_data_dir,
         disk_cache_dir=cache_dir,
     )
     self.hunspell.set_concurrency(n_jobs)
     # Maps a token to its replacement.
     self.substitutes = {}

Exemple #5

0

Afficher le fichier

    def __init__(self, dialect, script):
        """Store the dialect/script pair and load the matching Hunspell dictionary.

        A Hunspell instance is created only for Sorani in the Arabic script.
        Kurmanji in the Latin script is accepted without creating one; any
        other combination raises an Exception.
        """
        self.dialect = dialect
        self.script = script

        # Hunspell flag abbreviations mapped to the descriptive names used here.
        self.hunspell_flags = {"po": "pos", "is": "description", "ts": "terminal_suffix", "ds": "formation"}

        if self.dialect == "Sorani" and self.script == "Arabic":
            data_dir = klpt.get_data("data/")
            self.huns = Hunspell("ckb-Arab", hunspell_data_dir=data_dir)
        elif self.dialect != "Kurmanji" or self.script != "Latin":
            raise Exception("Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!")

Exemple #6

0

Afficher le fichier

    def run(self) -> Generator[Tuple[int, int, str, type], None, None]:
        """Run the linter and return a generator of errors.

        Reads ``self.filename``, extracts its comments with ``get_comments``,
        prints the Hunspell verdict for one fixed element of the second
        comment entry and then yields a single KOL001 error at position (0, 0).
        """
        with open(self.filename, 'r') as source:
            found = get_comments(source.read())

        # Materialise the comments so they can be indexed.
        entries = list(found)

        checker = Hunspell()
        verdict = checker.spell(entries[1][2][0])
        print(verdict)
        yield (0, 0, f'KOL001 Bad language found: ', TypoChecker)

Exemple #7

0

Afficher le fichier

Fichier : speller.py Projet : KeesCBakker/dutch-pluralizer-py

def new_hunspell_nl() -> Hunspell:
    """Create the "nl-nl" Hunspell speller and add the project's extra words.

    The dictionary is loaded from ``../dict/`` as resolved by
    ``__resolve_path``. Every word from ``get_plural_nouns()`` and
    ``get_basic_words()`` that the speller rejects is added to it.

    :return: the prepared Hunspell instance.
    """
    dictionary_path = __resolve_path("../dict/")
    hnspl = Hunspell("nl-nl", hunspell_data_dir=str(dictionary_path))

    # add words that are not present in current dictionary
    # (the loop variable is named so the builtin ``list`` is not shadowed)
    for extra_words in [get_plural_nouns(), get_basic_words()]:
        for word in extra_words:
            if not hnspl.spell(word):
                hnspl.add(word)

    return hnspl

Exemple #8

0

Afficher le fichier

Fichier : hunspell_test.py Projet : javiber/cython_hunspell

    def test_non_overlapping_caches(self):
        """Values seeded into self.h's caches must not be returned by an 'en_US' instance."""
        word, fake_key = 'testing', 'made-up'
        seeded_suggest = self.h.suggest(word)
        seeded_stem = self.h.stem(word)

        # Plant real results under a made-up key and confirm they are served back.
        self.h._suggest_cache[fake_key] = seeded_suggest
        self.assertEqual(self.h.suggest(fake_key), seeded_suggest)
        self.h._stem_cache[fake_key] = seeded_stem
        self.assertEqual(self.h.stem(fake_key), seeded_stem)

        # A second instance on another dictionary must not see the planted values.
        other = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(other.suggest(fake_key), seeded_suggest)
        self.assertNotEqual(other.stem(fake_key), seeded_stem)

Exemple #9

0

Afficher le fichier

Fichier : hunspell_test.py Projet : javiber/cython_hunspell

 def test_bad_path_encoding(self, *mocks):
     """A data dir containing a lone surrogate must be rejected (PY3) or reported on stderr."""
     bad_dir = u'bad/\udcc3/decoding'
     if not PY3:
         # Python 2 just make an illegal string instead of raising
         with captured_c_stderr_file() as caperr:
             Hunspell('not_checked', hunspell_data_dir=bad_dir)
             with open(caperr, 'r') as err:
                 captured = err.read()
                 self.assertRegexpSearch(captured, r'error:[^\n]*bad/[^\n]*/decoding')
         return

     with self.assertRaises(HunspellFilePathError):
         Hunspell('not_checked', hunspell_data_dir=bad_dir)

Exemple #10

0

Afficher le fichier

Fichier : spellers.py Projet : GarryGaller/nlp_toolkit

    def __init__(
        self,
        lang='en',
        max_dist=2,
        cpu=os.cpu_count(),
        # cache_manager="hunspell",disk_cache_dir=None,
        # hunspell_data_dir=None,system_encoding=None
        spell_kwargs={}):
        """Create the Hunspell dictionary for *lang*.

        :param lang: key looked up in ``self.langs``; used unchanged when absent.
        :param max_dist: stored as ``self.max_dist``.
        :param cpu: value passed to ``Hunspell.set_concurrency``.
        :param spell_kwargs: extra keyword arguments forwarded to ``Hunspell``.
        """
        # NOTE(review): both defaults above are evaluated once, when the
        # function is defined; spell_kwargs is only unpacked here, never mutated.
        self.lang = self.langs.get(lang, lang)
        speller = Hunspell(self.lang, **spell_kwargs)
        self.spell_dict = speller
        self.max_dist = max_dist
        speller.set_concurrency(cpu)

Exemple #11

0

Afficher le fichier

Fichier : hunspell_test.py Projet : javiber/cython_hunspell

    def test_overlapping_caches(self):
        """Values seeded into the caches must still be served by a new 'test' instance."""
        word, fake_key = 'testing', 'made-up'
        seeded_suggest = self.h.suggest(word)
        seeded_stem = self.h.stem(word)

        # Plant real results under a made-up key and confirm they are served back.
        self.h._suggest_cache[fake_key] = seeded_suggest
        self.assertEqual(self.h.suggest(fake_key), seeded_suggest)
        self.h._stem_cache[fake_key] = seeded_stem
        self.assertEqual(self.h.stem(fake_key), seeded_stem)

        # Drop the instance and build a fresh one on the same dictionary.
        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertEqual(self.h.suggest(fake_key), seeded_suggest)
        self.assertEqual(self.h.stem(fake_key), seeded_stem)

Exemple #12

0

Afficher le fichier

Fichier : hunspell_test.py Projet : twrodriguez/cython_hunspell

 def test_hunspell_bulk_stem(self):
     """bulk_action("stem", ...) returns a dict mapping each word to its list of stems."""
     d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)

     two_words = d.bulk_action("stem", ['dog', 'permanently'])
     self.assertDictEqual(two_words, {
         'permanently': ['permanent'],
         'dog': ['dog'],
     })

     four_words = d.bulk_action("stem", ['dog', 'twigs', 'permanently', 'unrecorded'])
     self.assertDictEqual(four_words, {
         'unrecorded': ['recorded'],
         'permanently': ['permanent'],
         'twigs': ['twig'],
         'dog': ['dog'],
     })
     del d

Exemple #13

0

Afficher le fichier

class HunspellChecker(object):
    """Single-word spelling corrector backed by a default Hunspell instance."""

    def __init__(self):
        self.checker = Hunspell()

    def correct(self, word):
        """Return *word* if it is spelled correctly, else Hunspell's first suggestion.

        When Hunspell has no suggestion the word is returned unchanged.
        """
        # Plain truthiness test instead of comparing the result with ``True``.
        if self.checker.spell(word):
            return word
        res = self.checker.suggest(word)
        return res[0] if res else word

Exemple #14

0

Afficher le fichier

Fichier : __init__.py Projet : zhanglipku/mediacloud

    def __init__(self):
        """Constructor.

        Creates the Treebank tokenizer and a Hunspell instance for the
        ``hi_IN`` dictionary stored under ``hindi-hunspell/dict-hi_IN`` next
        to this file, then stems one known word as a self-test.

        :raises McLanguageException: if the dictionary directory or its
            ``.dic`` / ``.aff`` file is missing, if Hunspell cannot be
            initialized, or if the self-test fails.
        """
        super().__init__()

        self.__treebank_tokenizer = TreebankWordTokenizer()

        hunspell_dict_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'hindi-hunspell',
            'dict-hi_IN',
        )
        if not os.path.isdir(hunspell_dict_dir):
            raise McLanguageException(
                "Hunspell dictionary directory does not exist at path: %s." %
                hunspell_dict_dir)

        # Report the path of the file that is actually missing rather than
        # the directory that contains it.
        dic_path = os.path.join(hunspell_dict_dir, 'hi_IN.dic')
        if not os.path.isfile(dic_path):
            raise McLanguageException(
                "Hunspell dictionary file does not exist at path: %s" %
                dic_path)
        aff_path = os.path.join(hunspell_dict_dir, 'hi_IN.aff')
        if not os.path.isfile(aff_path):
            raise McLanguageException(
                "Hunspell affix file does not exist at path: %s" %
                aff_path)

        try:
            self.__hindi_hunspell = Hunspell(
                lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
        except Exception as ex:
            # Chain the original error so its traceback is kept as the cause.
            raise McLanguageException(
                "Unable to initialize Hunspell with data directory '%s': %s" %
                (
                    hunspell_dict_dir,
                    str(ex),
                )) from ex

        # Quick self-test to make sure that Hunspell is installed and dictionary is available
        hunspell_exc_message = """
            Hunspell self-test failed; make sure that Hunspell is installed and dictionaries are accessible, e.g.
            you might need to fetch Git submodules by running:

                git submodule update --init --recursive
        """
        try:
            test_stems = self.stem_words(['गुरुओं'])
        except Exception as ex:
            raise McLanguageException(hunspell_exc_message) from ex
        else:
            if len(test_stems) == 0 or test_stems[0] != 'गुरु':
                raise McLanguageException(hunspell_exc_message)

Exemple #15

0

Afficher le fichier

 def test_hunspell_bulk_stem(self):
     """Check the word -> stems dict returned by bulk_action("stem", ...)."""
     d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)

     short_input = ['dog', 'permanently']
     short_expected = {'permanently': ['permanent'], 'dog': ['dog']}
     self.assertDictEqual(d.bulk_action("stem", short_input), short_expected)

     long_input = ['dog', 'twigs', 'permanently', 'unrecorded']
     long_expected = {
         'unrecorded': ['recorded'],
         'permanently': ['permanent'],
         'twigs': ['twig'],
         'dog': ['dog'],
     }
     self.assertDictEqual(d.bulk_action("stem", long_input), long_expected)
     del d

Exemple #16

0

Afficher le fichier

Fichier : hunspell_test.py Projet : javiber/cython_hunspell

    def test_clear_caches_non_peristance(self):
        """After clear_cache(), a new 'test' instance must not return the seeded values."""
        word, fake_key = 'testing', 'made-up'
        seeded_suggest = self.h.suggest(word)
        seeded_stem = self.h.stem(word)

        # Plant real results under a made-up key and confirm they are served back.
        self.h._suggest_cache[fake_key] = seeded_suggest
        self.assertEqual(self.h.suggest(fake_key), seeded_suggest)
        self.h._stem_cache[fake_key] = seeded_stem
        self.assertEqual(self.h.stem(fake_key), seeded_stem)

        self.h.clear_cache()

        # Drop the instance and build a fresh one on the same dictionary.
        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(self.h.suggest(fake_key), seeded_suggest)
        self.assertNotEqual(self.h.stem(fake_key), seeded_stem)

Exemple #17

0

Afficher le fichier

Fichier : hunspell_test.py Projet : ulwan/sihunspell_id

def test_non_overlapping_caches(hunspell):
    """Values seeded into the fixture's caches must not be returned by an 'en_US' instance."""
    word, fake_key = 'testing', 'made-up'
    seeded_suggest = hunspell.suggest(word)
    seeded_suffix = hunspell.suffix_suggest(word)
    seeded_stem = hunspell.stem(word)

    # Plant real results under a made-up key and confirm each cache serves them back.
    hunspell._suggest_cache[fake_key] = seeded_suggest
    assert hunspell.suggest(fake_key) == seeded_suggest
    hunspell._suffix_cache[fake_key] = seeded_suffix
    assert hunspell.suffix_suggest(fake_key) == seeded_suffix
    hunspell._stem_cache[fake_key] = seeded_stem
    assert hunspell.stem(fake_key) == seeded_stem

    # Only suggest and stem are checked on the second instance.
    other = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    assert other.suggest(fake_key) != seeded_suggest
    assert other.stem(fake_key) != seeded_stem

Exemple #18

0

Afficher le fichier

Fichier : HunSpellChecker.py Projet : deeppunster/curio_demo

class HunSpellCheckerClass:
    """
    Spell-check single words with a default Hunspell instance.
    """

    def __init__(self):
        """
        Create the Hunspell instance used by check_word.
        """
        debug('Initializing Hunspell')
        self.word_check = Hunspell()

    def check_word(self, test_word: str) -> bool:
        """
        Report whether the lower-cased form of a word is spelled correctly.

        Note: It appears that a lot of abbreviations are in the aspell
        dictionary, such as 'ac' and 'cf', so those have to be weeded
        out manually.

        :param test_word: word to check
        :return: true if spelled ok or false if not a valid word
        """
        debug(f'check_word received {test_word}')
        lowered = test_word.lower()
        verdict = self.word_check.spell(lowered)
        debug(f'check_word result {verdict}')
        return verdict

Exemple #19

0

Afficher le fichier

Fichier : lmgec.py Projet : blcuicall/gec-data-synthesis

def loadResources(args):
    """Load every resource the corrector needs and return them in one dict.

    :param args: object whose ``model`` attribute is passed to ``kenlm.Model``.
    :return: dict with keys "lm", "nlp", "gb", "gb_infl", "det" and "prep".
    """
    # Directory of this file; the bundled resources are addressed relative to it.
    here = os.path.dirname(os.path.realpath(__file__))
    # Language model built by KenLM: https://github.com/kpu/kenlm
    language_model = kenlm.Model(args.model)
    # spaCy pipeline
    pipeline = spacy.load("en")
    # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
    # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
    speller = Hunspell("en_GB-large",
                       hunspell_data_dir=here + '/resources/spelling/')
    # Inflection forms: http://wordlist.aspell.net/other/
    inflections = loadWordFormDict(here + "/resources/agid-2016.01.19/infl.txt")

    return {
        "lm": language_model,
        "nlp": pipeline,
        "gb": speller,
        "gb_infl": inflections,
        # Common determiners (the empty string is part of the set).
        "det": {"", "the", "a", "an"},
        # Common prepositions (the empty string is part of the set).
        "prep": {
            "", "about", "at", "by", "for", "from", "in", "of", "on", "to",
            "with",
        },
    }

Exemple #20

0

Afficher le fichier

Fichier : singularizer.py Projet : KeesCBakker/dutch-pluralizer-py

def create_fallback_options(singular: str, speller: Hunspell, options: [str],
                            stems: [str]) -> AdvancedSingularizationResult:
    """Append *singular* to *options* and wrap everything in a result object.

    The given ``options`` list is modified in place. The result carries
    *singular* and ``True`` when the speller accepts it, otherwise ``None``
    and ``False``; the last argument is ``True`` in both cases.
    """
    options.append(singular)
    accepted = speller.spell(singular)
    if not accepted:
        return AdvancedSingularizationResult(options, None, stems, False, True)
    return AdvancedSingularizationResult(options, singular, stems, True, True)

Exemple #21

0

Afficher le fichier

Fichier : replacements.py Projet : KeesCBakker/dutch-pluralizer-py

def search_by_dictionary_plus_s(speller: Hunspell, singular: str) -> Union[None, SearchResult]:
    """Return a SearchResult for ``singular + 's'`` if the speller accepts it, else None."""
    candidate = singular + 's'
    return SearchResult(candidate, None, "s") if speller.spell(candidate) else None

Exemple #22

0

Afficher le fichier

Fichier : hunspell_test.py Projet : javiber/cython_hunspell

 def test_windows_utf_8_encoding_applies_prefix(self, *mocks):
     """With os.name patched to 'nt', the error captured on C stderr must mention /not/valid."""
     with captured_c_stderr_file() as stderr_path:
         with patch("os.name", 'nt'):
             # If python file existance checks used prefix, this would raise a HunspellFilePathError
             Hunspell('test', system_encoding='UTF-8')
         with open(stderr_path, 'r') as stderr_file:
             captured = stderr_file.read()
             # But the Hunspell library lookup had the prefix applied
             self.assertRegexpSearch(captured, r'error:[^\n]*/not/valid[^\n]*')

Exemple #23

0

Afficher le fichier

Fichier : replacements.py Projet : KeesCBakker/dutch-pluralizer-py

def search_by_dictionary(speller:Hunspell, plural: str) -> Union[None, SearchResult]:
    """Swap the ending of *plural* using ``__ending_pairs`` until the speller accepts one.

    Returns a SearchResult with the accepted word, the ending that was
    removed and the ending that replaced it, or None when nothing matches.
    """
    for pairs in __ending_pairs:
        for ending, replacement in pairs.items():
            if not plural.endswith(ending):
                continue
            # Cut the matched ending off and attach its replacement.
            candidate = plural[:-len(ending)] + replacement
            if speller.spell(candidate):
                return SearchResult(candidate, ending, replacement)

    return None

Exemple #24

0

Afficher le fichier

Fichier : hunspell_test.py Projet : ulwan/sihunspell_id

def test_clear_caches_non_peristance(hunspell):
    """After clear_cache(), a new 'test' instance must not return the seeded values."""
    word, fake_key = 'testing', 'made-up'
    seeded_suggest = hunspell.suggest(word)
    seeded_suffix = hunspell.suffix_suggest(word)
    seeded_stem = hunspell.stem(word)

    # Plant real results under a made-up key and confirm each cache serves them back.
    hunspell._suggest_cache[fake_key] = seeded_suggest
    assert hunspell.suggest(fake_key) == seeded_suggest
    hunspell._suffix_cache[fake_key] = seeded_suffix
    assert hunspell.suffix_suggest(fake_key) == seeded_suffix
    hunspell._stem_cache[fake_key] = seeded_stem
    assert hunspell.stem(fake_key) == seeded_stem

    hunspell.clear_cache()

    # Drop the fixture's instance and build a fresh one on the same dictionary.
    del hunspell
    hunspell = Hunspell('test', hunspell_data_dir=DICT_DIR)
    assert hunspell.suggest(fake_key) != seeded_suggest
    assert hunspell.suffix_suggest(fake_key) != seeded_suffix
    assert hunspell.stem(fake_key) != seeded_stem

Exemple #25

0

Afficher le fichier

Fichier : hunspell_test.py Projet : twrodriguez/cython_hunspell

 def test_hunspell_bulk_suggest(self):
     """bulk_action("suggest", ...) returns a dict mapping each word to its suggestion list."""
     d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)

     small_expected = {
         'dpg': ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP'],
         'dog': ['dog'],
     }
     self.assertDictEqual(d.bulk_action("suggest", ['dog', 'dpg']), small_expected)

     words = ['dog', 'dpg', 'pgg', 'opg', 'dyg', 'frg', 'twg', 'bjn', 'foo', 'qre']
     large_expected = {
         'pgg': ['pg', 'peg', 'egg', 'pig', 'pug', 'pkg', 'pg g', 'PG'],
         'foo': ['few', 'goo', 'fop', 'foot', 'fool', 'food', 'foe', 'for', 'fro', 'too', 'fol', 'coo', 'fog', 'moo', 'fob'],
         'frg': ['fr', 'frig', 'frog', 'erg', 'fig', 'f*g', 'fro', 'fog', 'fry', 'fr g'],
         'twg': ['twig', 'tag', 'two', 'tog', 'tug', 'twp'],
         'bjn': ['bin', 'ban', 'bun', 'Bjorn'],
         'dog': ['dog'],
         'dpg': ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP'],
         'opg': ['op', 'pg', 'ope', 'ops', 'opt', 'mpg', 'opp', 'o pg', 'op g', 'GPO'],
         'dyg': ['dug', 'dye', 'deg', 'dig', 'dog', 'dying'],
         'qre': ['qr', 're', 'ere', 'ire', 'are', 'ore', 'Ore', 'Dre', 'q re', 'qr e'],
     }
     self.assertDictEqual(d.bulk_action("suggest", words), large_expected)
     del d

Exemple #26

0

Afficher le fichier

 def test_hunspell_bulk_suggest(self):
     """Check the word -> suggestions dict returned by bulk_action("suggest", ...)."""
     d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)

     # Expected suggestions for 'dpg', used by both assertions below.
     dpg_suggestions = ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP']

     two_word_result = d.bulk_action("suggest", ['dog', 'dpg'])
     self.assertDictEqual(two_word_result, {
         'dpg': dpg_suggestions,
         'dog': ['dog'],
     })

     ten_words = ['dog', 'dpg', 'pgg', 'opg', 'dyg', 'frg', 'twg', 'bjn', 'foo', 'qre']
     ten_word_result = d.bulk_action("suggest", ten_words)
     self.assertDictEqual(ten_word_result, {
         'pgg': ['pg', 'peg', 'egg', 'pig', 'pug', 'pkg', 'pg g', 'PG'],
         'foo': ['few', 'goo', 'fop', 'foot', 'fool', 'food', 'foe', 'for',
                 'fro', 'too', 'fol', 'coo', 'fog', 'moo', 'fob'],
         'frg': ['fr', 'frig', 'frog', 'erg', 'fig', 'f*g', 'fro', 'fog',
                 'fry', 'fr g'],
         'twg': ['twig', 'tag', 'two', 'tog', 'tug', 'twp'],
         'bjn': ['bin', 'ban', 'bun', 'Bjorn'],
         'dog': ['dog'],
         'dpg': dpg_suggestions,
         'opg': ['op', 'pg', 'ope', 'ops', 'opt', 'mpg', 'opp', 'o pg',
                 'op g', 'GPO'],
         'dyg': ['dug', 'dye', 'deg', 'dig', 'dog', 'dying'],
         'qre': ['qr', 're', 'ere', 'ire', 'are', 'ore', 'Ore', 'Dre',
                 'q re', 'qr e'],
     })
     del d

Exemple #27

0

Afficher le fichier

Fichier : singularizer.py Projet : KeesCBakker/dutch-pluralizer-py

def __stem(speller: Hunspell, plural: str) -> [str]:
    """Return the Hunspell stems of *plural* that pluralize back to *plural*.

    Each stem has the "ĳ" ligature replaced by "ij"; only stems at most three
    characters shorter than *plural* are passed to ``pluralize``.
    """
    # Lazy generator: each stem is normalized right before it is tested.
    normalized = (raw.replace("ĳ", "ij") for raw in speller.stem(plural))
    return [
        candidate
        for candidate in normalized
        if len(plural) - len(candidate) <= 3 and pluralize(candidate) == plural
    ]

Exemple #28

0

Afficher le fichier

Fichier : lmgec.py Projet : sai-prasanna/lmgec-lite

 def __init__(self, threshold=0.96):
     """Load the language model, spaCy, the en_GB-large speller and the word lists.

     :param threshold: stored unchanged as ``self.threshold``.
     """
     # Directory of this file; the bundled resources are addressed relative to it.
     here = os.path.dirname(os.path.realpath(__file__))
     self.lm = LanguageModel()
     # spaCy pipeline
     self.nlp = spacy.load("en")
     # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
     # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
     spelling_dir = here + '/resources/spelling/'
     self.gb = Hunspell("en_GB-large", hunspell_data_dir=spelling_dir)
     # Inflection forms: http://wordlist.aspell.net/other/
     inflection_file = here + "/resources/agid-2016.01.19/infl.txt"
     self.gb_infl = loadWordFormDict(inflection_file)
     # Common determiners (the empty string is part of the set).
     self.determiners = {"", "the", "a", "an"}
     # Common prepositions (the empty string is part of the set).
     self.prepositions = {
         "", "about", "at", "by", "for", "from",
         "in", "of", "on", "to", "with",
     }
     self.threshold = threshold

Exemple #29

0

Afficher le fichier

Fichier : feature_generation.py Projet : hashes4merkle/doppelgaenger-detection

 def leetScan(string, valDict, language="EN"):
     """Return the share of tokens in *string* that count as valid leet words.

     A token is a candidate when Hunspell ('en_US') rejects it and
     ``leetCheck`` accepts it; a candidate counts when ``swapValid`` accepts
     it together with *valDict* and the speller.

     :param string: text to tokenize with ``nltk.word_tokenize``.
     :param valDict: passed through to ``swapValid``.
     :param language: not used by this function.
     :return: ``Fraction(count, total tokens)``; ``Fraction(0)`` when the
         text has no tokens.
     """
     h = Hunspell('en_US', hunspell_data_dir='/Library/Spelling')
     tokens = nltk.word_tokenize(string)
     # Calculate Total Words in string
     total_words = len(tokens)
     # Without tokens there is nothing to count, and a zero denominator
     # would make Fraction raise ZeroDivisionError.
     if total_words == 0:
         return Fraction(0)

     # Misspelled tokens that contain leet are possible candidates.
     leetcandidates = [
         token for token in tokens
         if not h.spell(token) and leetCheck(token)
     ]
     # Test candidate list for word validity using swapping
     count = sum(
         1 for candidate in leetcandidates
         if swapValid(candidate, valDict, h)
     )
     return Fraction(count, total_words)

Exemple #30

0

Afficher le fichier

    def hunspell(self) -> Hunspell:
        """
        Returns the (cached) Hunspell instance, creating it on first use
        """

        if self._hunspell:
            return self._hunspell

        dict_name = self.lang.get_hunspell_dict_name()
        self._hunspell = Hunspell(
            dict_name,
            hunspell_data_dir=self.hunspell_data_dir,
        )
        return self._hunspell

Exemple #31

0

Afficher le fichier

def pluralize_advanced(
        singular: str,
        speller: Hunspell = None,
        ending_overrides: NounEndingMap = None) -> AdvancedPluralizationResult:
    """Pluralize *singular* and verify the outcome against the Hunspell speller.

    The rule-based plural is returned directly when the speller accepts it.
    Otherwise the speller's suggestions, ending replacements and a plain
    "+s" form are tried in that order; the first hit is returned together
    with the suggestions and the endings that were switched.

    :param singular: word to pluralize.
    :param speller: speller to use; ``ensure_hunspell_nl()`` supplies one when falsy.
    :param ending_overrides: passed through to ``__pluralize``.
    """
    speller = speller or ensure_hunspell_nl()

    plural = __pluralize(singular, ending_overrides)

    # empty plural - just stop
    if not plural:
        return AdvancedPluralizationResult(plural, None, (), None, None, False)

    # right spelled plural
    if speller.spell(plural):
        return AdvancedPluralizationResult(plural, plural, (), None, None, True)

    # No rightly spelled word yet: try each strategy in turn and stop at
    # the first one that produces a result.
    suggestions = speller.suggest(plural)
    found = search_by_suggestions(plural, suggestions)
    if not found:
        found = search_by_dictionary(speller, plural)
    if not found:
        found = search_by_dictionary_plus_s(speller, singular)

    if not found:
        return AdvancedPluralizationResult(plural, None, (), None, None, False)

    return AdvancedPluralizationResult(
        plural,
        found.plural,
        suggestions,
        found.switched_ending_from,
        found.switched_ending_to,
        True,
    )

Exemple #32

0

Afficher le fichier

Fichier : SpellChecker.py Projet : LuisVilarBarbosa/TextCategorizer

class SpellChecker:
    """Hunspell-based corrector that remembers the replacement chosen for each token."""

    def __init__(self,
                 language='en_US',
                 hunspell_data_dir='./hunspell',
                 n_jobs=1):
        """Fetch the dictionary if needed and set up the Hunspell checker.

        :param language: dictionary name passed to Hunspell and to get_dict.
        :param hunspell_data_dir: directory holding the dictionary files; the
            disk cache is placed in its 'cache' sub-directory.
        :param n_jobs: value handed to Hunspell.set_concurrency.
        """
        SpellChecker.get_dict(language, hunspell_data_dir)
        self.hunspell = Hunspell(language,
                                 hunspell_data_dir=hunspell_data_dir,
                                 disk_cache_dir=os.path.join(
                                     hunspell_data_dir, 'cache'))
        self.hunspell.set_concurrency(n_jobs)
        # Maps every token seen so far to its replacement.
        self.substitutes = dict()

    def spell_check(self, tokenized_corpus_2d):
        """Return the corpus with every token replaced by its substitute.

        Correct tokens map to themselves; misspelled tokens map to the first
        Hunspell suggestion, or to themselves when there is none. Only tokens
        not yet in ``self.substitutes`` are sent to Hunspell.

        :param tokenized_corpus_2d: iterable of token iterables; it is
            traversed twice, so it must be re-iterable.
        :return: list of lists with the substituted tokens.
        """
        tokens = {t for iterable in tokenized_corpus_2d for t in iterable}
        new_tokens = tokens - self.substitutes.keys()
        correct_tokens = {t for t in new_tokens if self.hunspell.spell(t)}
        self.substitutes.update((t, t) for t in correct_tokens)
        tokens_to_check = new_tokens - correct_tokens
        suggestions = self.hunspell.bulk_suggest(tokens_to_check)
        for token, candidates in suggestions.items():
            self.substitutes[token] = candidates[0] if candidates else token
        return [[self.substitutes[token] for token in iterable]
                for iterable in tokenized_corpus_2d]

    @staticmethod
    def get_dict(language, data_dir):
        """Download the ``.aff`` and ``.dic`` files for *language* if they are missing.

        Files are fetched from the LibreOffice dictionaries repository, first
        from the ``<language>/`` folder and, on a 404, from the folder named
        after the part of *language* before the first underscore.

        :raises: the HTTP error from ``raise_for_status()`` when neither
            download succeeds.
        """
        url = 'https://raw.githubusercontent.com/LibreOffice/dictionaries/master/%s/%s.%s'
        os.makedirs(data_dir, exist_ok=True)
        for ext in ['aff', 'dic']:
            path = os.path.join(data_dir, '%s.%s' % (language, ext))
            if os.path.exists(path):
                continue
            r = get(url % (language, language, ext))
            if r.status_code == 404:
                # partition() keeps the whole code when it has no underscore,
                # where slicing up to find() == -1 would drop its last character.
                folder = language.partition('_')[0]
                r = get(url % (folder, language, ext))
            # Check every response, not only the fallback one, so an error
            # page is never written to disk as a dictionary.
            r.raise_for_status()
            with open(path, 'wb') as f:
                f.write(r.content)

    def __del__(self):
        # __init__ may have failed before the checker was created.
        hunspell = getattr(self, 'hunspell', None)
        if hunspell is not None:
            hunspell.save_cache()  # For future program executions.

Exemple #33

0

Afficher le fichier

Fichier : hunspell_test.py Projet : twrodriguez/cython_hunspell

 def test_hunspell_suggest(self):
     """suggest('dpg') on the en_US dictionary returns the expected list, in order."""
     d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
     expected = ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP']
     actual = d.suggest('dpg')
     self.assertListEqual(actual, expected)
     del d

Exemple #34

0

Afficher le fichier

Fichier : hunspell_test.py Projet : MSeal/cython_hunspell

 def setUp(self):
     """Create the Hunspell instance used by the tests from the 'test' dictionary in DICT_DIR."""
     checker = Hunspell('test', hunspell_data_dir=DICT_DIR)
     self.h = checker

Exemple #35

0

Afficher le fichier

Fichier : hunspell_test.py Projet : twrodriguez/cython_hunspell

 def test_hunspell_spell(self):
     """spell() rejects 'dpg' and accepts 'dog' on the en_US dictionary."""
     d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
     misspelled_ok = d.spell('dpg')
     self.assertFalse(misspelled_ok)
     correct_ok = d.spell('dog')
     self.assertTrue(correct_ok)
     del d

Exemple #36

0

Afficher le fichier

Fichier : hunspell_test.py Projet : MSeal/cython_hunspell

class HunspellTest(unittest.TestCase):
    """Tests for Hunspell spell, suggest, stem and their bulk variants on the 'test' dictionary."""

    def setUp(self):
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        # A test may already have deleted self.h (see test_create_destroy).
        try:
            del self.h
        except AttributeError:
            pass

    def assertAllIn(self, checked, expected):
        """Assert that every item of *checked* is contained in *expected*."""
        all_found = all(item in expected for item in checked)
        message = u"{} not all found in {}".format(checked, expected)
        self.assertTrue(all_found, message)

    def test_create_destroy(self):
        del self.h

    def test_missing_dict(self):
        with self.assertRaises(IOError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    def test_spell(self):
        misspelled_ok = self.h.spell('dpg')
        self.assertFalse(misspelled_ok)
        correct_ok = self.h.spell('dog')
        self.assertTrue(correct_ok)

    def test_spell_utf8(self):
        known_ok = self.h.spell(u'café')
        self.assertTrue(known_ok)
        unknown_ok = self.h.spell(u'uncafé')
        self.assertFalse(unknown_ok)

    def test_spell_empty(self):
        # The empty string is reported as correctly spelled.
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        must_contain = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        offered = self.h.suggest('dpg')
        self.assertIsInstance(offered, tuple)
        self.assertAllIn(must_contain, offered)

    def test_suggest_utf8(self):
        must_contain = (u'café', u'Cerf')
        # The same word is passed once as a plain literal and once as a u'' literal.
        for variant in ('cefé', u'cefé'):
            offered = self.h.suggest(variant)
            self.assertIsInstance(offered, tuple)
            self.assertAllIn(must_contain, offered)

    def test_suggest_empty(self):
        offered = self.h.suggest('')
        self.assertEqual(offered, ())

    def test_stem(self):
        cases = (
            ('dog', ('dog',)),
            ('permanently', ('permanent',)),
        )
        for word, stems in cases:
            self.assertEqual(self.h.stem(word), stems)

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        by_word = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(by_word.keys()), ['dog', 'dpg'])
        self.assertIsInstance(by_word['dog'], tuple)
        self.assertAllIn(('dog',), by_word['dog'])

        must_contain = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(by_word['dpg'], tuple)
        self.assertAllIn(must_contain, by_word['dpg'])

        # The input is already sorted, so the sorted keys must equal it.
        words = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        by_word = self.h.bulk_suggest(words)
        self.assertEqual(sorted(by_word.keys()), words)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)

        two_words = self.h.bulk_stem(['dog', 'permanently'])
        self.assertDictEqual(two_words, {
            'permanently': ('permanent',),
            'dog': ('dog',),
        })

        four_words = self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded'])
        self.assertDictEqual(four_words, {
            'unrecorded': ('recorded',),
            'permanently': ('permanent',),
            'twigs': ('twig',),
            'dog': ('dog',),
        })

Exemple #37

0

Afficher le fichier

Fichier : hunspell_test.py Projet : twrodriguez/cython_hunspell

 def test_hunspell_stem(self):
     """stem() returns a one-element list for 'dog' and for 'permanently'."""
     d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
     cases = (
         ('dog', ['dog']),
         ('permanently', ['permanent']),
     )
     for word, stems in cases:
         self.assertListEqual(d.stem(word), stems)
     del d