コード例 #1
0
ファイル: spel.py プロジェクト: okkhoy/gabe-and-joh
"""
The Spel spellchecker

Spel reads sentences from standard input and prints each misspelled
word followed by its top weighted candidate suggestions (at most five).

Useful for testing the spellchecker.

@author: Johannes H. Jensen <*****@*****.**>
"""

if __name__ == '__main__':
    # Script entry point: read all of standard input, spell-check it as one
    # string, and print one "word: candidate candidate ..." line per result.
    #
    # sys is imported here so the script block is self-contained.
    import sys

    # Import and create spellchecker
    from spellcheck import Spellchecker

    spellchecker = Spellchecker()

    # Collect every input line up to EOF, dropping only the trailing newline.
    # Iterating sys.stdin (instead of calling raw_input) keeps the script
    # runnable on both Python 2 and Python 3.
    lines = [line.rstrip('\n') for line in sys.stdin]

    # The checker receives the whole input as a single space-joined string.
    text = ' '.join(lines)

    # For each reported result, print the word followed by at most five of
    # its candidates. sys.stdout.write is used because the Python 2 print
    # statement is a syntax error on Python 3; the emitted text is the same.
    for r in spellchecker.check(text):
        sys.stdout.write(r.word + ': ' + ' '.join(r.candidates[:5]) + '\n')
    
コード例 #2
0
ファイル: normalize.py プロジェクト: myeditha/switchsand
def normalize_codemixed_text(source_file,
                             language_file,
                             lang_list,
                             to_pickle=False):
    '''
    Spell-correct a code-mixed text file using per-token language tags.

    Line i of ``source_file`` is paired with line i of ``language_file``.
    Both lines are whitespace-split, each token is joined to its tag with
    the tag separator used below, and the resulting tagged sentence is
    passed to ``Spellchecker.correctSentence``. Whatever that call returns
    is written to ``<name>_filtered<.ext>`` next to ``source_file``.

    Pairing stops at the shorter of the two files, and within a line at the
    shorter of the token list and the tag list.

    :param source_file: path of the UTF-8 text file to correct
    :param language_file: path of the UTF-8 file holding one line of
        whitespace-separated language tags per line of ``source_file``
    :param lang_list: sequence of languages; index 0 is passed to the
        spellchecker as the major language and index 1 as the mixed language
    :param to_pickle: forwarded to the spellchecker as both ``repklEng``
        and ``repklAlt``
    :return: path of the output file that was written
    :raises IndexError: if ``lang_list`` has fewer than two entries
    '''
    head, inpFileName = os.path.split(source_file)
    # Split on the LAST dot only, so names with several dots
    # ("tweets.v2.txt") or with none ("tweets") no longer raise ValueError.
    # Names with exactly one dot produce the same output path as before.
    fileName, dot, ext = inpFileName.rpartition(".")
    if not dot:
        # No dot present: rpartition leaves the whole name in `ext`.
        fileName, ext = inpFileName, ""
    outFile = os.path.join(head, fileName + "_filtered" + dot + ext)

    # TODO: For now this is english but could be french. Adapt the
    # constructors accordingly.
    major_lang = lang_list[0]
    mixed_lang = lang_list[1]

    spellChecker = Spellchecker(mixedLang=mixed_lang,
                                majorLang=major_lang,
                                repklEng=to_pickle,
                                repklAlt=to_pickle,
                                aggressiveness=1,
                                outputType="firstOf",
                                altPath=None,
                                freqDocMajor=engDictPath,
                                dictDocMixed=hinDictPath)

    with codecs.open(source_file, 'r', encoding='utf-8') as f_src, \
            codecs.open(language_file, 'r', encoding='utf-8') as f_lang, \
            codecs.open(outFile, 'w', encoding='utf-8') as fw:
        for line, lids in zip(f_src, f_lang):
            words = line.strip().split()
            tags = lids.strip().split()

            # Attach each word's language tag to it, one tagged token per
            # word, space-separated.
            lang_tagged_line = " ".join(
                [x + '$\\$' + y for x, y in zip(words, tags)])
            print("Line observed:{}".format(lang_tagged_line))

            spell_corrected_line = spellChecker.correctSentence(
                lang_tagged_line)
            # NOTE(review): the corrected sentence is written exactly as
            # returned; no line separator is added here. If correctSentence
            # does not append one, all sentences end up on a single line.
            # Confirm against the Spellchecker implementation.
            fw.write(spell_corrected_line)

    # TODO: transliterating each corrected word into its language-specific
    # script is not implemented yet.
    return outFile
コード例 #3
0
ファイル: main.py プロジェクト: okkhoy/gabe-and-joh
#!/usr/bin/python
'''
Created on Sep 14, 2009

Main script for running the spell checker.
@author: Gabe Arnold <*****@*****.**>
'''

from sys import argv
from spellcheck import Spellchecker
from input_output import InputParser, OutputGenerator

if __name__ == '__main__':
    # Script entry point: parse the test cases from an XML file, spell-check
    # each case's string, and write the collected results.
    #
    # Usage: main.py <xml_file> <output_name>
    if len(argv) < 3:
        # Exit with a readable message instead of a bare IndexError.
        raise SystemExit('usage: main.py <xml_file> <output_name>')

    xml_file = argv[1]
    output_name = argv[2]

    # Read the input inside a context manager so the file handle is closed
    # as soon as its contents have been handed to the parser.
    with open(xml_file, 'r') as fh:
        # Named `parser` rather than `input` to avoid shadowing the builtin.
        parser = InputParser(fh.read())
    output = OutputGenerator(output_name)

    checker = Spellchecker()

    # Each case contributes an (id, original string, corrections) tuple.
    for case in parser.get_cases():
        corrections = checker.check(case['string'])
        output.append((case['id'], case['string'], corrections))

    output.write()