class HunspellChecker(object):
    """Spell corrector that replaces misspelled words with Hunspell's top suggestion.

    Stopwords and punctuation characters are passed through untouched.
    """

    def __init__(self):
        self.checker = Hunspell()
        self.stopwords = set(SW.words("english")) | set(string.punctuation)

    def correct_word(self, word):
        """Return *word* if Hunspell accepts it, else the first suggestion.

        When Hunspell offers no suggestion the word is returned unchanged.

        Approach borrowed from:
        https://datascience.blog.wzb.eu/2016/07/13/autocorrecting-misspelled-words-in-python-using-hunspell/
        """
        if self.checker.spell(word):
            return word
        candidates = self.checker.suggest(word)
        return candidates[0] if len(candidates) > 0 else word

    def correct_string(self, text, ensure_length=False):
        """Correct every whitespace-separated token of *text*.

        :param text: the string to correct
        :param ensure_length: when true, keep only the first word of a
            multi-word correction so the token count does not change
        :return: the corrected tokens joined by single spaces
        """
        def fix(token):
            # Stopwords / punctuation are never sent to the spell checker.
            if token in self.stopwords:
                return token
            replacement = self.correct_word(token)
            return replacement.split()[0] if ensure_length else replacement

        return " ".join(fix(token) for token in text.split())
class HunSpellCheckerClass:
    """Thin wrapper around a default-constructed Hunspell instance."""

    def __init__(self):
        """Create the underlying Hunspell checker."""
        debug('Initializing Hunspell')
        self.word_check = Hunspell()

    def check_word(self, test_word: str) -> bool:
        """
        Report whether a word is spelled correctly (checked in lower case).

        Note: many abbreviations (e.g. 'ac', 'cf') appear to be present in the
        dictionary, so callers may still need to weed those out manually.

        :param test_word: word to check
        :return: whatever Hunspell's ``spell`` reports for the lower-cased word
        """
        debug(f'check_word received {test_word}')
        lowered = test_word.lower()
        is_valid = self.word_check.spell(lowered)
        debug(f'check_word result {is_valid}')
        return is_valid
# Example #3
# 0
def spell_corrector(df, lang1, lang2):
    """Spell-correct the 'L1' column of *df* with a default Hunspell dictionary.

    Each sentence is split on word boundaries; alphabetic pieces that Hunspell
    rejects are replaced by its first suggestion, everything else (spaces,
    punctuation, correct words) is kept verbatim.

    :param df: dataframe with an 'L1' column (and an 'L2' column when
        *lang2* is given)
    :param lang1: unused by this function; kept for interface compatibility
    :param lang2: when not None, the 'L2' column is copied to the result
    :return: a new dataframe with the corrected 'L1' column, plus the
        original 'L2' column when *lang2* is not None
    """
    h = Hunspell()
    print('I am spell_checker')

    def fix_piece(piece):
        # Non-words (punctuation, whitespace, empty strings) and correctly
        # spelled words pass through unchanged.
        if not piece.isalpha() or h.spell(piece):
            return piece
        suggestions = h.suggest(piece)
        # TODO: rank the suggestions (e.g. by n-gram probability) instead of
        # always taking the first one.
        # Hunspell may return no suggestion at all; keep the original word
        # rather than failing with an IndexError.
        return suggestions[0] if suggestions else piece

    corrected = [
        ''.join(fix_piece(piece) for piece in re.split(r'\b', sent))
        for sent in df['L1']
    ]

    if lang2 is not None:
        return pd.DataFrame.from_dict({'L1': corrected, 'L2': list(df['L2'])})
    return pd.DataFrame(corrected, columns=['L1'])
def create_fallback_options(singular: str, speller: Hunspell, options: [str],
                            stems: [str]) -> AdvancedSingularizationResult:
    """Build the result for a last-resort singular candidate.

    The candidate is appended to *options* (the caller's list is mutated).
    If the speller accepts it, it is reported as the chosen singular;
    otherwise the result carries no chosen singular.
    """
    options.append(singular)
    if not speller.spell(singular):
        return AdvancedSingularizationResult(options, None, stems, False, True)
    return AdvancedSingularizationResult(options, singular, stems, True, True)
def search_by_dictionary_plus_s(speller: Hunspell, singular: str) -> Union[None, SearchResult]:
    """Try the plural formed by appending 's' to *singular*.

    Returns a SearchResult for that plural when the speller accepts it,
    otherwise None.
    """
    candidate = singular + 's'
    return SearchResult(candidate, None, "s") if speller.spell(candidate) else None
def search_by_dictionary(speller:Hunspell, plural: str) -> Union[None, SearchResult]:
    """Swap known endings on *plural* until the speller accepts one.

    Walks the ending maps in ``__ending_pairs`` in order; for every ending
    that *plural* ends with, the ending is replaced by its mapped counterpart
    and the first replacement the speller accepts is returned. Returns None
    when nothing is accepted.
    """
    for pair_map in __ending_pairs:
        for old_ending, new_ending in pair_map.items():
            if not plural.endswith(old_ending):
                continue
            candidate = plural[:-len(old_ending)] + new_ending
            if speller.spell(candidate):
                return SearchResult(candidate, old_ending, new_ending)

    return None
def new_hunspell_nl() -> Hunspell:
    """Create an 'nl-nl' Hunspell instance and top it up with extra words.

    The dictionary is loaded from the "../dict/" directory (resolved via
    ``__resolve_path``). Every word from ``get_plural_nouns()`` and
    ``get_basic_words()`` that the dictionary does not already accept is
    added to it.
    """
    speller = Hunspell("nl-nl",
                       hunspell_data_dir=str(__resolve_path("../dict/")))

    # add words that are not present in current dictionary
    for word_source in (get_plural_nouns(), get_basic_words()):
        for word in word_source:
            if not speller.spell(word):
                speller.add(word)

    return speller
# Example #8
# 0
    def run(self) -> Generator[Tuple[int, int, str, type], None, None]:
        """Run the linter and return a generator of errors.

        Reads ``self.filename``, extracts its comments, prints the spell-check
        result of one hard-coded comment element and yields a single
        KOL001 error tuple.
        """
        with open(self.filename, 'r') as source:
            extracted = list(get_comments(source.read()))

        # NOTE(review): only the element at [1][2][0] is checked and the
        # result is merely printed — this looks like work in progress.
        print(Hunspell().spell(extracted[1][2][0]))
        yield (0, 0, f'KOL001 Bad language found: ', TypoChecker)
# Example #9
# 0
class HunspellChecker(object):
    """Single-word spell corrector backed by a default Hunspell dictionary."""

    def __init__(self):
        self.checker = Hunspell()

    def correct(self, word):
        """Return *word* if it is spelled correctly, else the top suggestion.

        If Hunspell has no suggestion the word is returned unchanged.
        """
        if self.checker.spell(word):
            return word
        suggestions = self.checker.suggest(word)
        return suggestions[0] if suggestions else word
class SpellChecker:
    """Corpus-level spell corrector with a persistent token -> fix mapping.

    Dictionary files are downloaded on demand into *hunspell_data_dir*, and
    Hunspell's disk cache is kept in a ``cache`` sub-directory of it.
    """

    def __init__(self,
                 language='en_US',
                 hunspell_data_dir='./hunspell',
                 n_jobs=1):
        """
        :param language: dictionary name, e.g. 'en_US'
        :param hunspell_data_dir: directory holding the .aff/.dic files
        :param n_jobs: concurrency passed to Hunspell for bulk operations
        """
        SpellChecker.get_dict(language, hunspell_data_dir)
        self.hunspell = Hunspell(language,
                                 hunspell_data_dir=hunspell_data_dir,
                                 disk_cache_dir=os.path.join(
                                     hunspell_data_dir, 'cache'))
        self.hunspell.set_concurrency(n_jobs)
        # Maps every token seen so far to its replacement (itself if correct
        # or if no suggestion exists).
        self.substitutes = dict()

    def spell_check(self, tokenized_corpus_2d):
        """Return a copy of the corpus with every token spell-corrected.

        :param tokenized_corpus_2d: iterable of token iterables; it is
            traversed twice, so it must be re-iterable
        :return: list of lists with each token replaced by its substitute
        """
        tokens = {t for iterable in tokenized_corpus_2d for t in iterable}
        # Only tokens that were never seen before need to be checked.
        new_tokens = tokens - self.substitutes.keys()
        correct_tokens = {t for t in new_tokens if self.hunspell.spell(t)}
        self.substitutes.update((t, t) for t in correct_tokens)

        tokens_to_check = new_tokens - correct_tokens
        suggestions = self.hunspell.bulk_suggest(tokens_to_check)
        for token in tokens_to_check:
            candidates = suggestions.get(token)
            # Keep the token itself when Hunspell has nothing to offer.
            self.substitutes[token] = candidates[0] if candidates else token

        return [[self.substitutes[token] for token in iterable]
                for iterable in tokenized_corpus_2d]

    @staticmethod
    def get_dict(language, data_dir):
        """Download the .aff/.dic files for *language* unless already present.

        Files are fetched from the LibreOffice dictionaries repository, first
        from a folder named after the full language code and, on a 404, from
        a folder named after the part before the underscore.

        :raises: the HTTP error raised by ``raise_for_status`` when neither
            location can be downloaded successfully
        """
        url_template = ('https://raw.githubusercontent.com/LibreOffice/'
                        'dictionaries/master/%s/%s.%s')
        os.makedirs(data_dir, exist_ok=True)
        for ext in ['aff', 'dic']:
            path = os.path.join(data_dir, '%s.%s' % (language, ext))
            if os.path.exists(path):
                continue
            r = get(url_template % (language, language, ext))
            if r.status_code == 404:
                # e.g. 'en_US' lives in the 'en' folder; split() keeps codes
                # without an underscore intact.
                folder = language.split('_')[0]
                r = get(url_template % (folder, language, ext))
            # Never write an error page to disk as if it were a dictionary.
            r.raise_for_status()
            with open(path, 'wb') as f:
                f.write(r.content)

    def __del__(self):
        # __init__ may have failed before self.hunspell was assigned.
        hunspell = getattr(self, 'hunspell', None)
        if hunspell is not None:
            hunspell.save_cache()  # For future program executions.
 def leetScan(string, valDict, language="EN"):
     """Return the fraction of tokens in *string* that are valid leet words.

     A token counts when Hunspell rejects it, ``leetCheck`` flags it as
     containing leet, and ``swapValid`` confirms it becomes a valid word
     after character swapping.

     :param string: text to scan
     :param valDict: swap table handed to ``swapValid``
     :param language: currently unused; the 'en_US' dictionary is always used
     :return: ``Fraction(valid leet tokens, total tokens)``; ``Fraction(0)``
         when the text contains no tokens
     """
     tokens = nltk.word_tokenize(string)
     total_words = len(tokens)
     # Nothing to scan: avoid Fraction(0, 0), which raises ZeroDivisionError.
     if total_words == 0:
         return Fraction(0)

     h = Hunspell('en_US', hunspell_data_dir='/Library/Spelling')
     # Misspelled tokens that look like leet are the candidates.
     leetcandidates = [
         token for token in tokens
         if not h.spell(token) and leetCheck(token)
     ]
     # Test candidate list for word validity using swapping
     count = sum(1 for candidate in leetcandidates
                 if swapValid(candidate, valDict, h))
     return Fraction(count, total_words)
# Example #12
# 0
def pluralize_advanced(
        singular: str,
        speller: Hunspell = None,
        ending_overrides: NounEndingMap = None) -> AdvancedPluralizationResult:
    """Pluralize *singular* and validate the outcome against a speller.

    The rule-based plural from ``__pluralize`` is accepted as-is when the
    speller knows it. Otherwise a correctly spelled alternative is looked up,
    in order, among the speller's suggestions, by swapping endings, and by
    simply appending 's' to the singular.

    :param singular: the noun to pluralize
    :param speller: Hunspell instance; ``ensure_hunspell_nl()`` is used when
        none is given
    :param ending_overrides: passed through to ``__pluralize``
    :return: an AdvancedPluralizationResult describing the outcome
    """
    speller = speller or ensure_hunspell_nl()

    plural = __pluralize(singular, ending_overrides)

    # empty plural - just stop
    if not plural:
        return AdvancedPluralizationResult(plural, None, (), None, None, False)

    # the rule-based plural is spelled correctly
    if speller.spell(plural):
        return AdvancedPluralizationResult(plural, plural, (), None, None,
                                           True)

    # Fall back to suggestions, ending replacement and the plain "+s" form,
    # taking the first strategy that yields a correctly spelled word.
    suggestions = speller.suggest(plural)
    found: SearchResult = (
        search_by_suggestions(plural, suggestions)
        or search_by_dictionary(speller, plural)
        or search_by_dictionary_plus_s(speller, singular)
    )

    if not found:
        return AdvancedPluralizationResult(plural, None, (), None, None, False)

    return AdvancedPluralizationResult(plural, found.plural, suggestions,
                                       found.switched_ending_from,
                                       found.switched_ending_to, True)
def singularize_advanced(
        plural: str,
        speller: Hunspell = None,
        ending_overrides: NounEndingMap = None
) -> AdvancedSingularizationResult:
    """Singularize *plural* and validate the outcome against a speller.

    Strategy, in order:

    1. bail out when ``could_be_plural`` says the word is not a plural;
    2. return the first rule-generated option the speller accepts;
    3. return the first Hunspell stem, if any;
    4. strip a trailing "'s" / "s", or "en" (mapping a resulting final
       v -> f and z -> s), and report that fallback via
       ``create_fallback_options``.

    :param plural: the noun to singularize
    :param speller: Hunspell instance; ``ensure_hunspell_nl()`` is used when
        none is given
    :param ending_overrides: custom ending map used by the hard-map rule and
        by ``could_be_plural``
    :return: an AdvancedSingularizationResult describing the outcome
    """
    if not could_be_plural(plural, ending_overrides):
        return AdvancedSingularizationResult(None, None, (), False, False)

    options = __process_methods(
        plural,
        # the hard map should always be first!
        lambda word: singularize_by_hard_map(word, ending_overrides),
        singularize_oren,
        singularize_eren,
        singularize_by_latin,
        singularize_with_s,
        singularize_with_trema_en,
        singularize_with_en_single_vowel,
        singularize_with_en_double_vowel,
        singularize_with_en_double_consonant)

    if not speller:
        speller = ensure_hunspell_nl()

    # return option that is spelled correct
    for option in options:
        if speller.spell(option):
            return AdvancedSingularizationResult(options, option, (), True,
                                                 True)

    # Stems are only needed once no option matched, so compute them here
    # (and only once).
    stems = __stem(speller, plural)

    if stems:
        return AdvancedSingularizationResult(options, stems[0], stems, True,
                                             True)

    for ending in ("'s", "s"):
        if plural.endswith(ending):
            singular = plural[:-len(ending)]
            return create_fallback_options(singular, speller, options, stems)

    if plural.endswith("en"):
        singular = plural[:-2]

        if singular.endswith("v"):
            singular = singular[:-1] + "f"
        elif singular.endswith("z"):
            singular = singular[:-1] + "s"

        return create_fallback_options(singular, speller, options, stems)

    return AdvancedSingularizationResult(options, None, stems, False, True)
# Example #14
# 0
class HunspellTest(unittest.TestCase):
    """Tests for Hunspell spell / suggest / stem and their bulk variants.

    Every test runs against the 'test' dictionary loaded from DICT_DIR.
    """

    def setUp(self):
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        # test_create_destroy deletes self.h itself, hence the guard.
        try:
            del self.h
        except AttributeError:
            pass

    def assertAllIn(self, checked, expected):
        # Passes when every item of `checked` is contained in `expected`.
        self.assertTrue(all(x in expected for x in checked),
                        u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        # Creating (in setUp) and deleting an instance must not raise.
        del self.h

    def test_missing_dict(self):
        # An unknown dictionary name is expected to raise IOError.
        with self.assertRaises(IOError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        # Non-ASCII words are checked as well.
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        # The empty string is expected to count as correctly spelled.
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        # suggest() must return a tuple containing at least `required`.
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        # Both a plain and a u-prefixed literal are exercised.
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        # No suggestions are expected for the empty string.
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog', ))
        self.assertEqual(self.h.stem('permanently'), ('permanent', ))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        # The result is a dict keyed by the input words.
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog', ), suggest['dog'])

        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])

        # `checked` is written in sorted order so it can be compared directly
        # with the sorted result keys.
        checked = [
            'bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre',
            'twg'
        ]
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        # bulk_stem() maps each input word to its tuple of stems.
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent', ),
            'dog': ('dog', )
        })
        self.assertDictEqual(
            self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
                'unrecorded': ('recorded', ),
                'permanently': ('permanent', ),
                'twigs': ('twig', ),
                'dog': ('dog', )
            })
# Example #15
# 0
class SpellChecker:
    """
    Class for managing spell checking using Hunspell. Implemented as a class, as multiple instances of a SpellChecker
    might be used to maintain different dictionaries simultaneously (for example adding custom words).
    """
    def __init__(self, allowed_punctuation_marks, dictionary_directory):
        """
        Constructor method. Declares and creates a new Hunspell object.
        :param allowed_punctuation_marks: characters inserted verbatim into a regex character class ``[...]``,
            so regex-special characters must already be escaped by the caller
        :type: string
        :param dictionary_directory: directory containing the 'index' dictionary files
        :type: string
        """
        self.allowed_punctuation_marks = allowed_punctuation_marks
        self.dictionary_directory = dictionary_directory
        self.hunspell = None
        self.refresh_dict()

    def refresh_dict(self):
        """
        Create a new Hunspell object from the specified dictionary file.
        """
        self.hunspell = Hunspell('index',
                                 hunspell_data_dir=self.dictionary_directory)

    def is_punctuation_mark(self, word):
        """
        Checks if the given word starts with one of the allowed punctuation marks.
        :param word: a string with a single word
        :type: string
        :return: boolean indicating if the first character of the word is an allowed punctuation mark
        :type: boolean
        """
        return bool(re.match(r'[%s]' % self.allowed_punctuation_marks, word))

    def is_correctly_spelled(self, word):
        """
        Checks if the given word is correctly spelled.
        :param word: a string with a single word
        :type: string
        :return: boolean indicating if the spelling of the word is correct
        :type: boolean
        """
        return self.hunspell.spell(word)

    def suggest(self, word):
        """
        Suggest similar and correctly spelled alternatives for the given string. Orders Hunspell suggestions by
        edit distance.
        :param word: a string with a single word
        :type: string
        :return: a list of suggestions (empty if Hunspell has none)
        :type: list<string>
        """
        suggestions = self.hunspell.suggest(word)
        return sorted(suggestions,
                      key=lambda suggestion: edit_distance(word, suggestion))

    def fix(self, word):
        """
        Fixes the spelling of the given word.
        :param word: a string with a single word
        :type: string
        :return: the same word if it is correctly spelled, a punctuation mark, or has no suggestions;
            otherwise the suggestion closest to it by edit distance.
        """
        if self.is_punctuation_mark(word) or self.is_correctly_spelled(word):
            return word
        suggestions = self.suggest(word)
        # Hunspell may have nothing to offer; keep the word instead of raising IndexError.
        return suggestions[0] if suggestions else word

    def fix_text(self, text):
        """
        Fixes the spelling of a multi-worded phrase.
        :param text: the phrase string
        :type: string
        :return: the same phrase, with the spelling of each word fixed.
        """
        fixed_text = ' '.join(self.fix(word) for word in word_tokenize(text))
        # remove spaces preceding punctuation
        return re.sub(r' ([%s])' % self.allowed_punctuation_marks, r'\1',
                      fixed_text)
# Example #16
# 0
class WordScapeSolver:
    """Find dictionary words from a set of letters and fit them onto a board.

    Words are validated with Hunspell; the board is filled by backtracking.
    Board cells are expected to expose a ``c`` attribute plus ``save()`` and
    ``restore()`` methods, and the module is expected to define
    ``MAX_BOARD_SIZE`` and ``verbose``.
    NOTE(review): judging by the comparisons below, a cell with c == "."
    is an open slot and c == "" terminates a word — confirm against the
    board class.
    """

    def __init__(self):
        # NOTE(review): the second positional argument is presumably the
        # dictionary directory — confirm against the Hunspell constructor.
        self.h = Hunspell("en_US", "en_US")

    def solve_wordscape_helper(self, valid_words, letters, length):
        """Append to *valid_words* every word of *length* letters that Hunspell accepts.

        Candidates are the distinct permutations of *letters*.
        """
        p = set(itertools.permutations(letters, length))

        for raw_string in p:
            word = "".join(raw_string)
            if self.h.spell(word):
                valid_words.append(word)

    def try_fit(self, board, i, j, horizontal, word, length, valid_words):
        """Try to place *word* at (i, j) and solve the rest of the board.

        :param board: board whose ``content[row][col]`` cells are modified in place
        :param i: row of the first letter
        :param j: column of the first letter
        :param horizontal: place along the row if true, along the column otherwise
        :param word: the word to place
        :param length: number of cells to fill
        :param valid_words: words still available; not modified (a copy
            without *word* is passed down)
        :return: True if the word fits and the remaining board could be
            solved; otherwise False, with the touched cells restored
        """
        if horizontal:
            for k in range(length):
                if board.content[i][j + k].c == ".":
                    # Open slot: save the cell, then write the letter.
                    board.content[i][j + k].save(True)
                    board.content[i][j + k].c = word[k]
                elif board.content[i][j + k].c == word[k]:
                    # Cell already holds the right letter.
                    board.content[i][j + k].save(False)
                else:
                    # Conflict: undo the cells touched so far and give up.
                    for l in range(k):
                        board.content[i][j + l].restore()
                    return False

            # Each word may be used only once further down the search.
            new_word_list = valid_words.copy()
            new_word_list.remove(word)
            if verbose:
                print(board)
            success = self.try_solve(board, i, j, new_word_list)
            if not success:
                # Dead end: roll back the whole word.
                for l in range(length):
                    board.content[i][j + l].restore()
                return False
            else:
                return True

        else:
            # Same procedure as above, walking down the column instead.
            for k in range(length):
                if board.content[i + k][j].c == ".":
                    board.content[i + k][j].save(True)
                    board.content[i + k][j].c = word[k]
                elif board.content[i + k][j].c == word[k]:
                    board.content[i + k][j].save(False)
                else:
                    for l in range(k):
                        board.content[i + l][j].restore()
                    return False

            new_word_list = valid_words.copy()
            new_word_list.remove(word)
            if verbose:
                print(board)
            success = self.try_solve(board, i, j, new_word_list)
            if not success:
                for l in range(length):
                    board.content[i + l][j].restore()
                return False
            else:
                return True

    def find_unfilled_word(self, board, i, j, horizontal):
        """Measure the run of cells starting at (i, j) in one direction.

        The run ends at the first cell whose ``c`` is "".

        :return: ``(exist, length)`` where *exist* is true when the run
            contains at least one "." cell and is longer than one cell;
            ``(False, 0)`` when no "" cell is found before MAX_BOARD_SIZE
        """
        empty_spot = False

        if horizontal:
            length = 0
            for k in range(j, MAX_BOARD_SIZE):
                if not empty_spot and board.content[i][k].c == ".":
                    empty_spot = True

                if board.content[i][k].c == "":
                    exist = empty_spot and length > 1
                    return exist, length
                length += 1
        else:
            length = 0
            for k in range(i, MAX_BOARD_SIZE):
                if not empty_spot and board.content[k][j].c == ".":
                    empty_spot = True

                if board.content[k][j].c == "":
                    exist = empty_spot and length > 1
                    return exist, length
                length += 1

        return False, 0

    def find_next_word(self, board, i, j):
        """Scan the board in row-major order from (i, j) for the next unfilled word.

        At each position the horizontal direction is tried before the vertical one.

        :return: ``(finished, row, col, horizontal, length)``; *finished* is
            True (with placeholder values) when no unfilled word remains
        """
        for p in range(MAX_BOARD_SIZE):
            # Skip rows above the starting row.
            if p < i:
                continue

            for q in range(MAX_BOARD_SIZE):
                # In the starting row, skip columns left of the start.
                if p == i and q < j:
                    continue
                exist, length = self.find_unfilled_word(board, p, q, True)
                if exist:
                    return False, p, q, True, length
                exist, length = self.find_unfilled_word(board, p, q, False)
                if exist:
                    return False, p, q, False, length
        return True, 0, 0, True, 0

    def try_solve(self, board, i, j, valid_words):
        """Recursively fill the board starting the search at (i, j).

        :return: True when no unfilled word remains, False when no candidate
            word of the required length leads to a solution
        """
        finished, i, j, horizontal, length = self.find_next_word(board, i, j)

        if finished:
            return True

        # Only words of exactly the slot's length can fit.
        candidates = []
        for word in valid_words:
            if len(word) == length:
                candidates.append(word)

        # Candidates are tried from the end of the list (pop()).
        while len(candidates) > 0:
            if self.try_fit(board, i, j, horizontal, candidates.pop(), length,
                            valid_words):
                return True

        return False

    def solve_board(self, valid_words, board):
        """Solve *board* with *valid_words*, starting at the top-left cell."""
        return self.try_solve(board, 0, 0, valid_words)

    def solve(self, letters, length, board=None):
        """Print all valid words and, if a board is given, its solution.

        :param letters: the available letters (lower-cased before use)
        :param length: minimum word length; every length up to
            ``len(letters)`` is tried
        :param board: optional board to fill; printed when solved, otherwise
            "No Solutions!" is printed
        """
        letters = letters.lower()
        valid_words = []
        for i in range(length, len(letters) + 1):
            self.solve_wordscape_helper(valid_words, letters, i)

        for word in valid_words:
            print(word)

        #valid_words =
        if not board is None:
            if self.solve_board(valid_words, board):
                print(board)
            else:
                print("No Solutions!")
     dtype={
         'Id': str,
         'EssaySet': str,
         'essay_score1': np.int32,
         'essay_score2': np.int32,
         'EssayText': str
     })
 # Per-row spelling error rate: the percentage of whitespace-separated
 # tokens of 'EssayText' that `spell` rejects.
 df['error rate'] = np.nan
 for index, row in df.iterrows():
     # NOTE(review): a float here is presumably NaN from a missing essay —
     # confirm; such rows are given a 100% error rate.
     if type(row['EssayText']) is float:
         df.at[index, 'error rate'] = 100
         continue
     token_list = row['EssayText'].split()
     count_error = 0
     for t in token_list:
         if spell.spell(t) is False:
             count_error += 1
     # The division only runs when at least one error was counted, which
     # also implies that token_list is not empty.
     if count_error != 0:
         df.at[index,
               'error rate'] = count_error / len(token_list) * 100
     else:
         df.at[index, 'error rate'] = 0
 # Positional slices of the dataframe.
 # NOTE(review): the names presumably identify how each range of essays was
 # generated — verify against the dataset layout. The last slice ends at
 # 25001 rather than 25000; confirm that this is intended.
 random_char = df.iloc[:1000]
 random_word = df.iloc[1000:2000]
 brown_char_ngram = df.iloc[2000:7000]
 brown_word_ngram = df.iloc[7000:12000]
 asap_char_ngram = df.iloc[12000:17000]
 asap_word_ngram = df.iloc[17000:22000]
 content_burst = df.iloc[22000:23000]
 shuffle = df.iloc[23000:24000]
 gpt_2 = df.iloc[24000:25001]
# Example #18
# 0
 def test_hunspell_spell(self):
     # The en_US dictionary from DICT_DIR must reject 'dpg' and accept 'dog'.
     speller = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
     self.assertFalse(speller.spell('dpg'))
     self.assertTrue(speller.spell('dog'))
     # Explicitly drop the instance at the end of the test.
     del speller
 def test_hunspell_spell(self):
     # Load en_US from DICT_DIR and check one misspelled and one valid word.
     checker = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
     self.assertFalse(checker.spell('dpg'))
     self.assertTrue(checker.spell('dog'))
     # Explicitly drop the instance at the end of the test.
     del checker
# Example #20
# 0
class UnsupervisedGrammarCorrector:
    """Language-model-driven corrector for spelling and simple grammar errors.

    For every token a set of replacement candidates is generated (spelling
    suggestions, inflections, determiners, prepositions); the candidate
    sentence with the best language-model score is kept, and the procedure
    repeats until the score stops improving.
    """

    def __init__(self, threshold=0.96):
        """
        :param threshold: stored on the instance; not used by the methods of
            this class
        """
        basename = os.path.dirname(os.path.realpath(__file__))
        self.lm = LanguageModel()
        # Load spaCy
        self.nlp = spacy.load("en")
        # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
        # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
        self.gb = Hunspell("en_GB-large",
                           hunspell_data_dir=basename + '/resources/spelling/')
        # Inflection forms: http://wordlist.aspell.net/other/
        self.gb_infl = loadWordFormDict(basename +
                                        "/resources/agid-2016.01.19/infl.txt")
        # List of common determiners ("" stands for deleting the token)
        self.determiners = {"", "the", "a", "an"}
        # List of common prepositions ("" stands for deleting the token)
        self.prepositions = {
            "", "about", "at", "by", "for", "from", "in", "of", "on", "to",
            "with"
        }
        self.threshold = threshold

    def correct(self, sentence):
        """Return the best-scoring correction of *sentence*.

        ``process`` is applied repeatedly for as long as it produces a
        sentence with a strictly higher language-model score.
        """
        # If the line is empty, preserve the newline in output and continue
        if not sentence:
            return ""
        best = sentence
        score = self.lm.score(best)

        while True:
            new_best, new_score = self.process(best)
            if not new_best or new_score <= score:
                break
            best, score = new_best, new_score

        return best

    def process(self, sentence: str) -> Tuple[str, float]:
        """Run one correction pass over *sentence*.

        :return: ``(best_sentence, best_score)`` — the highest-scoring
            single-token replacement of the sentence and its score, or the
            sentence itself with its own score when no candidate scores
            higher
        """
        # Process sent with spacy
        proc_sent = self.nlp.tokenizer(sentence)
        self.nlp.tagger(proc_sent)
        # Calculate avg token prob of the sent so far.
        orig_prob = self.lm.score(proc_sent.text)
        # Store all the candidate corrected sentences here
        candidates = []
        # Process each token.
        for tok in proc_sent:
            candidate_tokens = set()

            lower_cased_token = tok.lower_

            # SPELLCHECKING
            # Spell check: tok must be alphabetical and not a real word.
            if lower_cased_token.isalpha(
            ) and not self.gb.spell(lower_cased_token):
                candidate_tokens |= set(self.gb.suggest(lower_cased_token))
            # MORPHOLOGY
            if tok.lemma_ in self.gb_infl:
                candidate_tokens |= self.gb_infl[tok.lemma_]
            # DETERMINERS
            if lower_cased_token in self.determiners:
                candidate_tokens |= self.determiners
            # PREPOSITIONS
            if lower_cased_token in self.prepositions:
                candidate_tokens |= self.prepositions

            # Keep only candidates the dictionary accepts.
            candidate_tokens = [
                c for c in candidate_tokens if self.gb.spell(c)
            ]

            if candidate_tokens:
                # Mirror the casing of the original token.
                if tok.is_title:
                    candidate_tokens = [c.title() for c in candidate_tokens]
                elif tok.is_upper:
                    candidate_tokens = [c.upper() for c in candidate_tokens]

                candidates.extend(
                    self._generate_candidates(tok.i, candidate_tokens,
                                              proc_sent))

        best_prob = orig_prob
        best = sentence

        for candidate in candidates:
            # Score the candidate sentence (once; the score is reused for
            # the progress output below).
            cand_prob = self.lm.score(candidate.text)
            print(candidate.text, cand_prob, cand_prob)

            # Keep the candidate if it beats the best score so far.
            if cand_prob > best_prob:
                best_prob = cand_prob
                best = candidate.text
        # Return the best sentence together with its score
        return best, best_prob

    def _generate_candidates(self, tok_id, candidate_tokens,
                             tokenized_sentence) -> list:
        """Build one candidate sentence per replacement token.

        :param tok_id: index of the token to replace
        :param candidate_tokens: replacement strings; an empty string drops
            the token
        :param tokenized_sentence: the tokenized original sentence
        :return: list of re-tokenized candidate sentences (the objects
            returned by ``self.nlp.tokenizer``)
        """
        # Save candidates here.
        candidates = []

        prefix = tokenized_sentence[:tok_id]
        suffix = tokenized_sentence[tok_id + 1:]
        # Loop through the input alternative candidates
        for token in candidate_tokens:
            candidate = prefix.text_with_ws
            if token:
                candidate += token + " "
            candidate += suffix.text_with_ws
            candidates.append(self.nlp.tokenizer(candidate))
        return candidates
class HunspellTest(unittest.TestCase):
    """Tests for Hunspell spell / suggest / stem and their bulk variants.

    Every test runs against the 'test' dictionary loaded from DICT_DIR.
    """

    def setUp(self):
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        # test_create_destroy deletes self.h itself, hence the guard.
        try:
            del self.h
        except AttributeError:
            pass

    def assertAllIn(self, checked, expected):
        # Passes when every item of `checked` is contained in `expected`.
        self.assertTrue(all(x in expected for x in checked),
            u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        # Creating (in setUp) and deleting an instance must not raise.
        del self.h

    def test_missing_dict(self):
        # An unknown dictionary name is expected to raise IOError.
        with self.assertRaises(IOError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        # Non-ASCII words are checked as well.
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        # The empty string is expected to count as correctly spelled.
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        # suggest() must return a tuple containing at least `required`.
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        # Both a plain and a u-prefixed literal are exercised.
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        # No suggestions are expected for the empty string.
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog',))
        self.assertEqual(self.h.stem('permanently'), ('permanent',))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        # The result is a dict keyed by the input words.
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog',), suggest['dog'])

        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])

        # `checked` is written in sorted order so it can be compared directly
        # with the sorted result keys.
        checked = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        # bulk_stem() maps each input word to its tuple of stems.
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent',),
            'dog': ('dog',)
        })
        self.assertDictEqual(self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
            'unrecorded': ('recorded',),
            'permanently': ('permanent',),
            'twigs': ('twig',),
            'dog': ('dog',)
        })
class HunspellTest(unittest.TestCase):
    """Tests for the Hunspell wrapper, run against the 'test' dictionary in DICT_DIR.

    Covers constructor failures, spell/suggest/stem and their bulk variants,
    adding words at runtime, and the sharing, persistence and clearing of the
    suggest/stem caches.
    """
    def assertRegexpSearch(self, *args, **kwargs):
        """Forward to assertRegex when PY3 is set, otherwise to assertRegexpMatches."""
        if PY3:
            self.assertRegex(*args, **kwargs)
        else:
            self.assertRegexpMatches(*args, **kwargs)

    def setUp(self):
        """Create a fresh Hunspell instance for the 'test' dictionary."""
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        """Drop the instance; some tests delete self.h themselves, hence the guard."""
        try:
            del self.h
        except AttributeError:
            pass

    def assertAllIn(self, checked, expected):
        """Assert that every element of `checked` is contained in `expected`."""
        self.assertTrue(all(x in expected for x in checked),
            u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        # Deleting the instance created by setUp must not raise.
        del self.h

    def test_missing_dict(self):
        # A dictionary name that does not exist in DICT_DIR is rejected.
        with self.assertRaises(HunspellFilePathError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    # os.path.isfile and os.access are patched to always return True,
    # presumably so that only the path-encoding step can fail here.
    @patch('os.path.isfile', return_value=True)
    @patch('os.access', return_value=True)
    def test_bad_path_encoding(self, *mocks):
        # The data dir contains a lone surrogate (\udcc3) that cannot be encoded.
        if PY3:
            with self.assertRaises(HunspellFilePathError):
                Hunspell('not_checked',
                    hunspell_data_dir=u'bad/\udcc3/decoding')
        else:
            # Python 2 just makes an illegal string instead of raising, so the
            # failure is only visible in what gets written to the C-level stderr.
            with captured_c_stderr_file() as caperr:
                Hunspell('not_checked',
                    hunspell_data_dir=u'bad/\udcc3/decoding')
                with open(caperr, 'r') as err:
                    self.assertRegexpSearch(err.read(), r'error:[^\n]*bad/[^\n]*/decoding')

    # The Windows long-path prefix is replaced with a recognisable bogus value.
    @patch('hunspell.hunspell.WIN32_LONG_PATH_PREFIX', '/not/valid')
    def test_windows_utf_8_encoding_applies_prefix(self, *mocks):
        with captured_c_stderr_file() as caperr:
            # Pretend to run on Windows for the duration of the constructor call.
            with patch("os.name", 'nt'):
                # If python file existence checks used prefix, this would raise a HunspellFilePathError
                Hunspell('test', system_encoding='UTF-8')
            with open(caperr, 'r') as err:
                # But the Hunspell library lookup had the prefix applied
                self.assertRegexpSearch(err.read(), r'error:[^\n]*/not/valid[^\n]*')

    def test_spell(self):
        # 'dpg' is a misspelling, 'dog' is in the dictionary.
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        # Non-ASCII words are checked like any other word.
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        # The empty string counts as correctly spelled.
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        # Suggestions are a tuple that contains at least the required words.
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        # Both a plain and a u-prefixed literal are tried (they differ on Python 2).
        required = (u'café', u'Cerf')
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        # No suggestions for the empty string.
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        # stem() returns a tuple of stems.
        self.assertEqual(self.h.stem('dog'), ('dog',))
        self.assertEqual(self.h.stem('permanently'), ('permanent',))

    def test_add(self):
        # A word added at runtime is accepted by spell() and offered by suggest().
        word = 'outofvocabularyword'
        self.assertEqual(self.h.spell(word), False)
        self.h.add(word)
        self.assertEqual(self.h.spell(word), True)
        typo = word + 'd'
        self.assertAllIn([word], self.h.suggest(typo))

    def test_bulk_suggest(self):
        # bulk_suggest returns a dict keyed by input word with tuple values.
        self.h.set_concurrency(3)
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog',), suggest['dog'])

        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])

        # `checked` is already sorted, so it can be compared to the sorted keys.
        checked = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        # bulk_stem returns a dict keyed by input word with tuples of stems.
        self.h.set_concurrency(3)
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent',),
            'dog': ('dog',)
        })
        self.assertDictEqual(self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
            'unrecorded': ('recorded',),
            'permanently': ('permanent',),
            'twigs': ('twig',),
            'dog': ('dog',)
        })

    def test_non_overlapping_caches(self):
        # Entries planted in one dictionary's caches must not leak into an
        # instance created for a different dictionary ('en_US').
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')

        # Plant fake results for 'made-up'; the lookups below return them,
        # which shows suggest()/stem() consult the caches.
        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)

        h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(h2.suggest('made-up'), test_suggest)
        self.assertNotEqual(h2.stem('made-up'), test_stem)

    def test_overlapping_caches(self):
        # A new instance for the same dictionary sees cache entries planted
        # through the previous instance.
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')

        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)

        # Recreate the instance; the planted entries must still be served.
        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.assertEqual(self.h.stem('made-up'), test_stem)

    def test_save_caches_persistance(self):
        # Caches saved to disk are loaded again by a later instance that uses
        # the same disk_cache_dir and cache manager.
        temp_dir = tempfile.mkdtemp()
        try:
            h1 = Hunspell('test',
                hunspell_data_dir=DICT_DIR,
                disk_cache_dir=temp_dir,
                cache_manager='disk_hun')
            test_suggest = h1.suggest('testing')
            test_stem = h1.stem('testing')

            h1._suggest_cache['made-up'] = test_suggest
            self.assertEqual(h1.suggest('made-up'), test_suggest)
            h1._stem_cache['made-up'] = test_stem
            self.assertEqual(h1.stem('made-up'), test_stem)

            h1.save_cache()
            del h1

            # Drop every cache registered with the manager so that h2 cannot
            # simply reuse the in-memory caches of h1.
            cacheman = get_cache_manager('disk_hun')
            cacheman.deregister_all_caches()
            self.assertEqual(len(cacheman.cache_by_name), 0)

            h2 = Hunspell('test',
                hunspell_data_dir=DICT_DIR,
                disk_cache_dir=temp_dir,
                cache_manager='disk_hun')

            self.assertNotEqual(len(h2._suggest_cache), 0)
            self.assertNotEqual(len(h2._stem_cache), 0)
            self.assertEqual(h2.suggest('made-up'), test_suggest)
            self.assertEqual(h2.stem('made-up'), test_stem)
        finally:
            shutil.rmtree(temp_dir) # Nuke temp content

    def test_clear_caches_persistance(self):
        # Clearing after saving must leave nothing for a later instance to load.
        temp_dir = tempfile.mkdtemp()
        try:
            h1 = Hunspell('test',
                hunspell_data_dir=DICT_DIR,
                disk_cache_dir=temp_dir,
                cache_manager='disk_hun')
            test_suggest = h1.suggest('testing')
            test_stem = h1.stem('testing')

            h1._suggest_cache['made-up'] = test_suggest
            self.assertEqual(h1.suggest('made-up'), test_suggest)
            h1._stem_cache['made-up'] = test_stem
            self.assertEqual(h1.stem('made-up'), test_stem)

            # Order matters: the caches are saved first and cleared afterwards.
            h1.save_cache()
            h1.clear_cache()
            del h1

            cacheman = get_cache_manager('disk_hun')
            cacheman.deregister_all_caches()
            self.assertEqual(len(cacheman.cache_by_name), 0)

            h2 = Hunspell('test',
                hunspell_data_dir=DICT_DIR,
                disk_cache_dir=temp_dir,
                cache_manager='disk_hun')

            self.assertEqual(len(h2._suggest_cache), 0)
            self.assertEqual(len(h2._stem_cache), 0)
            self.assertNotEqual(h2.suggest('made-up'), test_suggest)
            self.assertNotEqual(h2.stem('made-up'), test_stem)
        finally:
            shutil.rmtree(temp_dir) # Nuke temp content

    def test_clear_caches_non_peristance(self):
        # With the default cache manager, clear_cache() removes planted entries
        # so that a recreated instance no longer serves them.
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')

        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)

        self.h.clear_cache()

        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(self.h.suggest('made-up'), test_suggest)
        self.assertNotEqual(self.h.stem('made-up'), test_stem)
Exemple #23
0
class Stem:
    """Spell checking, spelling correction and morphological analysis of words.

    All three tasks are delegated to a Hunspell dictionary from the
    `Kurdish Hunspell project <https://github.com/sinaahmadi/KurdishHunspell>`_.
    Only the Sorani dialect written in the Arabic script is accepted.
    """

    def __init__(self, dialect, script):
        """Load the Sorani/Arabic dictionary.

        Args:
            dialect (str): must be "Sorani"
            script (str): must be "Arabic"

        Raises:
            Exception: for any other dialect/script combination
        """
        # Hunspell morphological flag -> key used in the dictionaries returned by analyze()
        self.hunspell_flags = {
            "po": "pos",
            "is": "description",
            "ts": "terminal_suffix",
            "ds": "formation",
        }
        if not (dialect == "Sorani" and script == "Arabic"):
            raise Exception(
                "Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!"
            )
        self.huns = Hunspell("ckb-Arab",
                             hunspell_data_dir=klpt.get_data("data/"))

    # Stemming and lemmatization of a single word are not implemented yet.

    def check_spelling(self, word):
        """Tell whether a word is spelled correctly.

        Args:
            word (str): word to check

        Raises:
            TypeError: if `word` is not a string

        Returns:
            bool: the verdict of the underlying Hunspell dictionary
        """
        if isinstance(word, str):
            return self.huns.spell(word)
        raise TypeError("Only a word (str) is allowed.")

    def correct_spelling(self, word):
        """Check a word and, if it is misspelled, propose corrections.

        Args:
            word (str): word to check

        Raises:
            TypeError: if `word` is not a string

        Returns:
            tuple (bool, list): `(True, [])` when the word is correct, otherwise
            `(False, suggestions)`; the list is empty when Hunspell has no
            suggestion to offer.
        """
        if not isinstance(word, str):
            raise TypeError("Only a word (str) is allowed.")
        if self.check_spelling(word):
            outcome = (True, [])
        else:
            outcome = (False, list(self.huns.suggest(word)))
        return outcome

    def analyze(self, word_form):
        """Return every morphological analysis Hunspell produces for a word-form.

        More details regarding Kurdish morphological analysis can be found at
        https://github.com/sinaahmadi/KurdishHunspell

        Args:
            word_form (str): a single word-form

        Raises:
            TypeError: if `word_form` is not a string

        Returns:
            (list(dict)): one dictionary per analysis; empty list if there is none.
            Each dictionary may contain:

             - "pos": value of the `po` flag
             - "description": value of the `is` flag, or of the `ds` flag when present
             - "formation": "derivational" when a `ds` flag is present
             - "base": the value attached to the `ts` flag, i.e. the morphological base
             - "terminal_suffix": the word-form with the base removed, or "0"
               (zero morpheme) when nothing is left or no `ts` flag was found
        """
        if not isinstance(word_form, str):
            raise TypeError("Only a word (str) is allowed.")

        flags = self.hunspell_flags
        results = []
        for raw_analysis in self.huns.analyze(word_form):
            entry = {}
            for field in raw_analysis.split():
                if ":" not in field:
                    continue
                pieces = field.split(":")
                left, right = pieces[0], pieces[1]
                if right == "ts":
                    # Unlike the other flags, ts is emitted as value:key.
                    entry["base"] = left
                    # Whatever remains once the base is stripped is the terminal suffix.
                    entry[flags[right]] = word_form.replace(left, "")
                elif left in flags:
                    if left == "ds":
                        # A ds flag marks a derivational form; its value is the description.
                        entry[flags[left]] = "derivational"
                        entry[flags["is"]] = right
                    else:
                        entry[flags[left]] = right

            # A missing or empty terminal suffix is recorded as the zero morpheme.
            if entry.get(flags["ts"], "") == "":
                entry[flags["ts"]] = "0"

            results.append(entry)

        return results
Exemple #24
0

def clean_message(message, replacement):
    """Replace every phone-number-like span of `message` with `replacement`.

    The pattern covers an optional country code, three groups of 3-3-4 digits
    separated by dashes, dots, spaces or parentheses, and an optional
    ``x<extension>``. Whitespace around the number is part of the match, so it
    is replaced as well.
    """
    phone_pattern = (
        r'\s*(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\s*'
    )
    return re.sub(phone_pattern, replacement, message)


def process_extra(tokens):
    """Normalise tokens so that later time/date detection is not confused.

    Steps, in order:
      1. drop tokens that start with a highway number (400, 401, 403, 404, 407);
      2. cut hyphenated ranges down to their first number ("8-9:30" -> "8");
      3. remove references to 7/11 (" 7 11" or " 7-11") from the joined text;
      4. glue a digit to a following am/pm ("8 pm" -> "8pm").

    Returns the resulting text split on single spaces.
    """
    kept = []
    for token in tokens:
        if re.match(r'(400|401|403|404|407)', token):
            continue
        kept.append(re.sub(r'(\d+)-[\d:]+', r'\1', token))

    text = ' '.join(kept)
    # NOTE(review): a token "7-11" has already been reduced to "7" by step 2,
    # so the "7-11" alternative looks unreachable for ordinary tokens - confirm.
    text = re.sub(r' (7 11|7-11)', r"", text)
    text = re.sub(r"([0-9]) (am|pm)", r"\1\2", text)
    return text.split(' ')


if __name__ == "__main__":
    # Manual smoke test: run one sample post through the pipeline, then
    # spell-check a single word.
    sample_post = "Offering: Jun.3 Sunday 8pm Waterloo (Burger King) -> Mississauga Square One $10/Pearson Airport $40, text 5197211776"
    print(process(sample_post))
    print(hspell.spell('rhill'))
from hunspell import Hunspell

# Korean checker; the "ko" dictionary is expected in the 'ko' data directory.
h = Hunspell("ko", hunspell_data_dir='ko')

if __name__ == "__main__":
    # Demo: spell-check two strings, then ask for corrections of the first one.
    for text in ("안녕하세요으", "안녕하세"):
        print(h.spell(text))

    print(h.suggest("안녕하세요으"))
Exemple #26
0
class Stem:
    """

    The Stem module deals with various tasks, mainly through the following functions:
        - `check_spelling`: spell error detection
        - `correct_spelling`: spell error correction
        - `analyze`: morphological analysis

    Please note that spell checking and correction are only supported for Sorani in this version of the module; `analyze` additionally accepts Kurmanji in the Latin script. The Sorani part is based on the [Kurdish Hunspell project](https://github.com/sinaahmadi/KurdishHunspell).

    Example:
    ```python
    >>> from klpt.stem import Stem
    >>> stemmer = Stem("Sorani", "Arabic")
    >>> stemmer.check_spelling("سوتاندبووت")
    False
    >>> stemmer.correct_spelling("سوتاندبووت")
    (False, ['ستاندبووت', 'سووتاندبووت', 'سووڕاندبووت', 'ڕووتاندبووت', 'فەوتاندبووت', 'بووژاندبووت'])
    >>> stemmer.analyze("دیتبامن")
    [{'pos': 'verb', 'description': 'past_stem_transitive_active', 'base': 'دیت', 'terminal_suffix': 'بامن'}]
    ```

    """

    def __init__(self, dialect, script):
        """
        Set up the analyzer for a dialect/script pair.

        Args:
            dialect (str): "Sorani" or "Kurmanji"
            script (str): "Arabic" (with Sorani) or "Latin" (with Kurmanji)

        Raises:
            Exception: for any other dialect/script combination

        """
        self.dialect = dialect
        self.script = script

        # Hunspell morphological flag -> key used in the dictionaries returned by analyze()
        self.hunspell_flags = {"po": "pos", "is": "description", "ts": "terminal_suffix", "ds": "formation"}
        # Only Sorani has a Hunspell dictionary; keep the attribute defined in every case.
        self.huns = None
        if self.dialect == "Sorani" and self.script == "Arabic":
            self.huns = Hunspell("ckb-Arab", hunspell_data_dir=klpt.get_data("data/"))
        elif not (self.dialect == "Kurmanji" and self.script == "Latin"):
            raise Exception("Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!")

    # Stemming and lemmatization of a single word are not implemented yet.

    def _supports_hunspell(self):
        """Return True when this instance is backed by the Sorani Hunspell dictionary."""
        return self.dialect == "Sorani" and self.script == "Arabic"

    def check_spelling(self, word):
        """Check spelling of a word

        Args:
            word (str): input word to be spell-checked

        Raises:
            TypeError: if `word` is not a string, or if the dialect/script of this instance has no spell checker yet

        Returns:
            bool: True if the spelling is correct, False if the spelling is incorrect
        """
        if not isinstance(word, str):
            # Same message as analyze(); "Not supported yet." was misleading for a wrong input type.
            raise TypeError("Only a word (str) is allowed.")
        if not self._supports_hunspell():
            # Kept as TypeError so callers that already catch it keep working.
            raise TypeError("Not supported yet.")
        return self.huns.spell(word)

    def correct_spelling(self, word):
        """
        Correct spelling errors if the input word is incorrect. It returns a tuple where the first element indicates the correctness of the word (True if correct, False if incorrect).
            If the input word is correct, the tuple is (True, []).
            If the input word is incorrect, suggestions are provided in a list as the second element of the tuple, as (False, [...]); the list is empty if no suggestion is available.

        Args:
            word (str): input word to be spell-checked

        Raises:
            TypeError: if `word` is not a string, or if the dialect/script of this instance has no spell checker yet

        Returns:
            tuple (boolean, list)

        """
        # check_spelling() validates both the input type and the dialect support.
        if self.check_spelling(word):
            return (True, [])
        return (False, list(self.huns.suggest(word)))

    def analyze(self, word_form):
        """
        Morphological analysis of a given word.
        
        It returns morphological analyses. Each morphological analysis is returned as a dictionary as follows:
        
        - "pos": the part-of-speech of the word-form according to [the Universal Dependency tag set](https://universaldependencies.org/u/pos/index.html). 
        - "description": value of the `is` flag (or of the `ds` flag when it is set)
        - "terminal_suffix": the word-form without its base, or "0" (zero morpheme) when nothing is left
        - "formation": if ds flag is set, its value is assigned to description and the value of formation is set to derivational. Although the majority of our morphological rules cover inflectional forms, it is not accurate to say all of them are inflectional. Therefore, we only set this value to derivational wherever we are sure.
        - "base": `ts` flag. The definition of terminal suffix is a bit tricky in Hunspell. According to [the Hunspell documentation](http://manpages.ubuntu.com/manpages/trusty/en/man4/hunspell.4.html), "Terminal suffix fields are inflectional suffix fields "removed" by additional (not terminal) suffixes". In other words, the ts flag in Hunspell represents whatever is left after stripping all affixes. Therefore, it is the morphological base.

        As in [{'pos': 'verb', 'description': 'past_stem_transitive_active', 'base': 'دیت', 'terminal_suffix': 'بامن'}]
        If the input cannot be analyzed morphologically, an empty list is returned.

        Sorani: 
        More details regarding Sorani Kurdish morphological analysis can be found at [https://github.com/sinaahmadi/KurdishHunspell](https://github.com/sinaahmadi/KurdishHunspell).

        Kurmanji:
        Regarding Kurmanji, we use the morphological analyzer provided by the [Kurmanji part](https://github.com/apertium/apertium-kmr) of Apertium. For Kurmanji, only "base" and "description" are filled in; "pos", "terminal_suffix" and "formation" are left as empty strings.

        Please note that there are delicate differences between how the analyzers work in Hunspell and Apertium. For instance, the `base` in the Kurmanji analysis refers to the lemma while in Sorani (from Hunspell), it refers to the morphological base.

        Args:
            word_form (str): a single word-form

        Raises:
            TypeError: only string as input

        Returns:
            (list(dict)): a list of all possible morphological analyses according to the defined morphological rules
            
        """
        if not isinstance(word_form, str):
            raise TypeError("Only a word (str) is allowed.")

        word_analysis = []
        if self.dialect == "Sorani" and self.script == "Arabic":
            flags = self.hunspell_flags
            # Turn each Hunspell analysis string ("key:value" fields) into a dictionary.
            for analysis in self.huns.analyze(word_form):
                analysis_dict = {}
                for item in analysis.split():
                    if ":" not in item:
                        continue
                    pieces = item.split(":")
                    left, right = pieces[0], pieces[1]
                    if right == "ts":
                        # ts flag exceptionally appears after the value as value:key in the Hunspell output
                        analysis_dict["base"] = left
                        # whatever remains of the word-form once the base is removed is the terminal suffix
                        analysis_dict[flags[right]] = word_form.replace(left, "")
                    elif left in flags:
                        # for ds flag, add derivation as the formation type and use its value as description
                        if left == "ds":
                            analysis_dict[flags[left]] = "derivational"
                            analysis_dict[flags["is"]] = right
                        else:
                            analysis_dict[flags[left]] = right

                # if there is no value assigned to the ts flag, the terminal suffix is a zero-morpheme 0
                if analysis_dict.get(flags["ts"], "") == "":
                    analysis_dict[flags["ts"]] = "0"

                word_analysis.append(analysis_dict)

        elif self.dialect == "Kurmanji" and self.script == "Latin":
            att_analysis = Analysis("Kurmanji", "Latin").analyze(word_form)
            # an empty second element means the word-form could not be analyzed
            if not len(att_analysis[1]):
                return []

            for form_analysis in list(att_analysis[-1]):
                for analysis in form_analysis:
                    # presumably "<surface>@<lemma><tag1><tag2>..." - TODO confirm against Analysis
                    structure = analysis[0].rsplit('@', 1)[1].split("<", 1)
                    # TODO: the description needs further information extraction in such a way that some values should be assigned to the "pos" key
                    word_analysis.append({
                        "base": structure[0],
                        "description": structure[1].replace("><", "_").replace(">", "").strip(),
                        "pos": "",
                        "terminal_suffix": "",
                        "formation": "",
                    })

        return word_analysis
Exemple #27
0
class CyHunspell():
    '''
    Speller based on the Cython version of hunspell.

    >>> word_en = 'cookbok'
    >>> word_ru = 'поваринная'
    >>> speller_en = CyHunspell(lang="en")
    >>> speller_en.spell(word_en)
    False
    >>> speller_en.suggest(word_en)
    ('cookbook', 'copybook', 'codebook', 'Cook', 'cook')
    >>> speller_en.replace(word_en)
    'cookbook'
    >>> speller_ru = CyHunspell(lang="ru")
    >>> speller_ru.spell(word_ru)
    False
    >>> speller_ru.suggest(word_ru)
    ('поваренная',)
    >>> speller_ru.replace(word_ru)
    'поваренная'
    '''

    # Short language code -> Hunspell dictionary name; unknown codes are used as given.
    langs = {'ru': 'ru_RU', 'en': 'en_US'}

    def __init__(
        self,
        lang='en',
        max_dist=2,
        cpu=os.cpu_count(),
        spell_kwargs=None):
        '''
        Create the underlying Hunspell dictionary.

        :param lang: short code from `langs` or a full dictionary name
        :param max_dist: default maximum edit distance accepted by `replace`
        :param cpu: value passed to `Hunspell.set_concurrency`
        :param spell_kwargs: extra keyword arguments for `Hunspell`, e.g.
            cache_manager, disk_cache_dir, hunspell_data_dir, system_encoding
        '''
        # None instead of a shared mutable {} default; both mean "no extra arguments".
        extra_kwargs = {} if spell_kwargs is None else spell_kwargs

        self.lang = self.langs.get(lang, lang)
        self.spell_dict = Hunspell(self.lang, **extra_kwargs)
        self.max_dist = max_dist
        self.spell_dict.set_concurrency(cpu)

    def spell(self, word):
        '''
        Return the dictionary's verdict for `word`, or None when the word
        cannot be encoded for the dictionary (UnicodeEncodeError).
        '''
        try:
            return self.spell_dict.spell(word)
        except UnicodeEncodeError:
            return None

    def suggest(self, word):
        '''
        Return the suggestions for `word`, or an empty tuple when the word
        cannot be encoded for the dictionary (UnicodeEncodeError).
        '''
        try:
            return self.spell_dict.suggest(word)
        except UnicodeEncodeError:
            return tuple()

    def replace(self, word, max_dist=None):
        '''
        Return the first suggestion for a misspelled `word` if it lies within
        `max_dist` edits (default: `self.max_dist`); otherwise return `word`
        unchanged. Correctly spelled words are returned as they are.
        '''
        max_dist = max_dist if max_dist is not None else self.max_dist

        if self.spell(word):
            return word
        suggestions = self.suggest(word)
        # Only the top suggestion is considered, and only if it is close enough.
        if suggestions and edit_distance(word, suggestions[0]) <= max_dist:
            return suggestions[0]
        return word