""" The Spel spellchecker Spel reads sentences from standard input and prints a list of misspelled words and provides weighted candidate suggestions. Useful for testing the spellchecker. @author: Johannes H. Jensen <*****@*****.**> """ if __name__ == '__main__': # Import and create spellchecker from spellcheck import Spellchecker spellchecker = Spellchecker() lines = [] while True: try: lines.append(raw_input()) except EOFError: break text = ' '.join(lines) results = spellchecker.check(text) for r in results: print r.word + ':', ' '.join(r.candidates[:5])
def normalize_codemixed_text(source_file, language_file, lang_list, to_pickle=False):
    '''
    Spell-correct a language-tagged, code-mixed text file and save the result.

    Line i of ``source_file`` is paired with line i of ``language_file``.
    Both are split on whitespace, each word is glued to the tag at the same
    position, and the tagged sentence is handed to
    ``Spellchecker.correctSentence``. Whatever it returns is written as one
    line of the output file. Only spell-correction is performed here;
    hashtag/RT filtering and transliteration are not implemented.

    :param source_file: path to the UTF-8 text file to correct, one
        sentence (tweet) per line
    :param language_file: path to a UTF-8 file whose lines hold the
        whitespace-separated language tags for the matching source line
    :param lang_list: sequence of languages; element 0 is passed to the
        spellchecker as the major language, element 1 as the mixed language
    :param to_pickle: forwarded to the spellchecker as both ``repklEng`` and
        ``repklAlt`` (presumably "rebuild the pickled dictionaries" --
        confirm against Spellchecker)
    :return: path of the written file, ``<name>_filtered<ext>`` in the same
        directory as ``source_file``
    '''
    head, inp_file_name = os.path.split(source_file)
    # splitext copes with names containing no dot or several dots, where a
    # plain split(".") would fail to unpack into exactly two parts.
    stem, ext = os.path.splitext(inp_file_name)
    out_file = os.path.join(head, stem + "_filtered" + ext)

    # TODO: For now this is english but could be french. Adapt the
    # constructors accordingly.
    major_lang = lang_list[0]
    mixed_lang = lang_list[1]

    spell_checker = Spellchecker(mixedLang=mixed_lang,
                                 majorLang=major_lang,
                                 repklEng=to_pickle,
                                 repklAlt=to_pickle,
                                 aggressiveness=1,
                                 outputType="firstOf",
                                 altPath=None,
                                 freqDocMajor=engDictPath,
                                 dictDocMixed=hinDictPath)

    # Inputs are opened before the output so that a missing input does not
    # leave an empty output file behind.
    with codecs.open(source_file, 'r', encoding='utf-8') as f_src, \
            codecs.open(language_file, 'r', encoding='utf-8') as f_lang, \
            codecs.open(out_file, 'w', encoding='utf-8') as fw:
        # NOTE: zip stops at the shorter file, and likewise at the shorter of
        # words/tags within a line; surplus items are silently ignored.
        for line, lids in zip(f_src, f_lang):
            words = line.split()
            tags = lids.split()

            # Attach each word to its language tag using the literal
            # three-character separator: dollar, backslash, dollar.
            lang_tagged_line = " ".join(
                word + '$\\$' + tag for word, tag in zip(words, tags))
            print("Line observed:{}".format(lang_tagged_line))

            spell_corrected_line = spell_checker.correctSentence(
                lang_tagged_line)

            fw.write(spell_corrected_line)
            # The input line ending was discarded by split(); put one back so
            # consecutive sentences do not run together in the output.
            if not spell_corrected_line.endswith("\n"):
                fw.write("\n")

    return out_file
#!/usr/bin/python
'''
Created on Sep 14, 2009

Main script for running the spell checker.

Usage: <script> <input_file> <output_name>

The input file is read in full and handed to InputParser; every case it
yields is spell-checked and the results are written through
OutputGenerator.

@author: Gabe Arnold <*****@*****.**>
'''
from sys import argv

from spellcheck import Spellchecker
from input_output import InputParser, OutputGenerator


def main(args):
    '''
    Spell-check every case in the input file and write the corrections.

    :param args: command-line argument list in ``sys.argv`` layout, i.e.
        ``[program, input_file, output_name]``
    :raises SystemExit: with a usage message when an argument is missing
    '''
    if len(args) < 3:
        raise SystemExit('usage: %s <input_file> <output_name>' % args[0])

    xml_file = args[1]
    output_name = args[2]

    # Read the whole document up front and close the handle straight away
    # instead of leaving it open for the lifetime of the process.
    with open(xml_file, 'r') as fh:
        parser = InputParser(fh.read())

    output = OutputGenerator(output_name)
    checker = Spellchecker()

    # Each case is indexed by 'id' and 'string'; the output gets one
    # (id, original string, corrections) tuple per case.
    for case in parser.get_cases():
        corrections = checker.check(case['string'])
        output.append((case['id'], case['string'], corrections))

    output.write()


if __name__ == '__main__':
    main(argv)