Ejemplo n.º 1
0
def spell(word):
    """Return the best-scoring correction for ``word``, cased by ``get_case``.

    Candidate sets are tried in a fixed order and the first non-empty one
    wins: the word itself through ``common``, ``exact`` and ``known``, then
    its single typos through ``known``, then its double typos through
    ``common``.  When every stage comes back empty, ``word`` itself is the
    only candidate.  The winner is the candidate with the largest
    ``NLP_COUNTS`` entry.
    """
    parsed = Word(word)
    # Each stage is wrapped in a thunk so it only runs if every earlier
    # stage produced nothing (same short-circuiting as an ``or`` chain).
    stages = (
        lambda: common([word]),
        lambda: exact([word]),
        lambda: known([word]),
        lambda: known(parsed.typos()),
        lambda: common(parsed.double_typos()),
    )
    candidates = None
    for stage in stages:
        candidates = stage()
        if candidates:
            break
    else:
        # No stage matched: fall back to the input unchanged.
        candidates = [word]
    best = max(candidates, key=lambda candidate: NLP_COUNTS[candidate])
    return get_case(word, best)
Ejemplo n.º 2
0
def autocorrect(misspelled):
    """Return the best-scoring correction for ``misspelled``.

    Lookups are attempted one after another and the first non-empty result
    supplies the candidates: ``common``, ``exact`` and ``known`` on the word
    itself, ``known`` on its single typos, ``common`` on its double typos,
    and finally the word unchanged.  The candidate with the highest
    ``NLP_COUNTS.get`` value is returned after ``get_case`` is applied.
    """
    parsed = Word(misspelled)

    # Stepwise fallbacks: each later lookup runs only if the previous one
    # returned something falsy.
    candidates = common([misspelled])
    if not candidates:
        candidates = exact([misspelled])
    if not candidates:
        candidates = known([misspelled])
    if not candidates:
        candidates = known(parsed.typos())
    if not candidates:
        candidates = common(parsed.double_typos())
    if not candidates:
        candidates = [misspelled]

    best = max(candidates, key=NLP_COUNTS.get)
    return get_case(misspelled, best)
Ejemplo n.º 3
0
def spell(word):
    """Return the best-scoring correction for ``word``, or ``'<UNK>'``.

    The lookups run in order -- ``common``, ``exact`` and ``known`` on the
    word itself, ``known`` on its single typos, ``common`` on its double
    typos -- and the first non-empty result supplies the candidates.  If no
    lookup yields anything, the literal ``'<UNK>'`` is returned instead of
    the input word.  Otherwise the candidate with the highest
    ``NLP_COUNTS.get`` value is passed through ``get_case``.
    """
    parsed = Word(word)
    lookups = (
        lambda: common([word]),
        lambda: exact([word]),
        lambda: known([word]),
        lambda: known(parsed.typos()),
        lambda: common(parsed.double_typos()),
    )
    # The generator is consumed lazily, so lookups after the first
    # non-empty result are never executed.
    candidates = next(filter(None, (lookup() for lookup in lookups)), None)
    if not candidates:
        return '<UNK>'
    best = max(candidates, key=NLP_COUNTS.get)
    return get_case(word, best)
Ejemplo n.º 4
0
def spell(word, lang_sample, file_format='bz'):
    """Return the best-scoring correction for ``word``, cased by ``get_case``.

    Args:
        word: the word to correct.
        lang_sample: corpus handed to ``parse`` when ``file_format`` is
            ``'txt'``; unused for ``'bz'``, which always parses ``'big.txt'``.
        file_format: ``'bz'`` (default) or ``'txt'``.

    Returns:
        ``get_case(word, correction)``, where ``correction`` is the candidate
        with the highest count returned by ``parse``.  Candidates come from
        the first non-empty lookup among ``common``, ``exact`` and ``known``
        on the word, ``known`` on its single typos and ``common`` on its
        double typos; the word itself is the last resort.

    Raises:
        ValueError: if ``file_format`` is neither ``'bz'`` nor ``'txt'``.
    """
    # Validate before doing any work: an unknown format used to fall through
    # both branches and crash later on an unbound local.
    if file_format not in ('bz', 'txt'):
        raise ValueError(
            "unsupported file_format: %r (expected 'bz' or 'txt')"
            % (file_format,))

    from autocorrect.nlp_parser import parse
    from autocorrect.word import Word, common, exact, known, get_case

    # NOTE(review): the corpus is parsed again on every call; consider
    # caching the result if this is called in a loop.
    if file_format == 'bz':
        nlp_words, nlp_counts = parse('big.txt', 'bz')
    else:
        nlp_words, nlp_counts = parse(lang_sample, 'txt')

    w = Word(word)
    candidates = (common([word], nlp_words) or exact([word]) or known([word])
                  or known(w.typos()) or common(w.double_typos()) or [word])
    correction = max(candidates, key=nlp_counts.get)
    return get_case(word, correction)
Ejemplo n.º 5
0
def spell(word, language='en'):
    """Return the best-scoring correction for ``word`` in ``language``.

    Supported languages are ``'en'`` (scored with ``NLP_COUNTS``) and
    ``'bn'`` (scored with ``NLP_COUNTS_BN``); both use the same candidate
    lookups.  Candidates come from the first non-empty result among
    ``common``, ``exact`` and ``known`` on the word, ``known`` on its single
    typos and ``common`` on its double typos, falling back to the word
    itself.  The winner is passed through ``get_case``.

    Raises:
        ValueError: if ``language`` is neither ``'en'`` nor ``'bn'``.
    """
    # Guard clause: reject unsupported languages before any lookup runs.
    if language != 'en' and language != 'bn':
        raise ValueError("This language is not supported")

    parsed = Word(word)
    candidates = (
        common([word])
        or exact([word])
        or known([word])
        or known(parsed.typos())
        or common(parsed.double_typos())
        or [word]
    )
    # Only the frequency table differs between the two languages.
    counts = NLP_COUNTS if language == 'en' else NLP_COUNTS_BN
    best = max(candidates, key=counts.get)
    return get_case(word, best)
Ejemplo n.º 6
0
def spelltest(tests, verbose=False):
    """Run ``spell`` over a table of misspellings and summarise the misses.

    Args:
        tests: mapping of target word -> whitespace-separated misspellings.
        verbose: when true, print one ``MSG`` line for every misspelling
            whose correction differs from its target.

    Returns:
        ``RESULT`` formatted with, in order: the number of wrong
        corrections, the number of misspellings tried, the integer
        percentage corrected properly, the number of wrong cases whose
        target is not ``known``, and the elapsed time in whole seconds.
        An empty ``tests`` mapping reports 100 percent (nothing failed).
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for interval timing.
    start = time.perf_counter()
    n = bad = unknown = 0
    for target, incorrect_spellings in tests.items():
        for incorrect_spelling in incorrect_spellings.split():
            n += 1
            w = spell(incorrect_spelling)
            if w == target:
                continue
            bad += 1
            if not known([target]):
                unknown += 1
            if verbose:
                print(MSG.format(incorrect_spelling, w, NLP_COUNTS[w],
                                 target, NLP_COUNTS[target]))
    # Guard against ZeroDivisionError when no misspellings were supplied.
    percent_ok = int(100. - 100. * bad / n) if n else 100
    elapsed = int(time.perf_counter() - start)
    return RESULT.format(bad, n, percent_ok, unknown, elapsed)
Ejemplo n.º 7
0
def spelltest(tests, verbose=False):
    """Run ``spell`` over a table of misspellings and summarise the misses.

    Args:
        tests: mapping of target word -> whitespace-separated misspellings.
        verbose: when true, print one ``MSG`` line for every misspelling
            whose correction differs from its target.

    Returns:
        ``RESULT`` formatted with, in order: the number of wrong
        corrections, the number of misspellings tried, the integer
        percentage corrected properly, the number of wrong cases whose
        target is not ``known``, and the elapsed time in (fractional)
        seconds.  An empty ``tests`` mapping reports 100 percent (nothing
        failed).
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for interval timing.
    start = time.perf_counter()
    n = bad = unknown = 0
    for target, incorrect_spellings in tests.items():
        for incorrect_spelling in incorrect_spellings.split():
            n += 1
            w = spell(incorrect_spelling)
            if w == target:
                continue
            bad += 1
            if not known([target]):
                unknown += 1
            if verbose:
                print(
                    MSG.format(incorrect_spelling, w, NLP_COUNTS[w],
                               target, NLP_COUNTS[target]))
    # Guard against ZeroDivisionError when no misspellings were supplied.
    percent_ok = int(100. - 100. * bad / n) if n else 100
    elapsed = time.perf_counter() - start
    return RESULT.format(bad, n, percent_ok, unknown, elapsed)