def preprocess(self, corpus):
    '''
    Iterates through the corpus line by line: tokenizing, spell-checking,
    normalizing abbreviations, etc.
    It's debatable how you want to tokenize; I chose to do it by sentence,
    but it could be done with a regex such as "[\w' ]+", which would give
    you smaller phrases.
    I: list of text strings
    O: preprocessed list of text strings
    '''
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    # lowercase and tokenize by sentence
    corpus_formatted = [tokenized_line for line in corpus
                        for tokenized_line in sent_detector.tokenize(line.strip().lower())]
    # consider re-designing to a helper function that does multiple manipulations to a string
    # expanding the abbreviations
    corpus_formatted_expanded = [multiple_replace(line) for line in corpus_formatted]
    # spell check -- note: this function takes quite a bit of time
    corpus_formatted_expanded_correct = [' '.join([spell_checker.correct(word)
                                                   for word in line.strip().split()])
                                         for line in corpus_formatted_expanded]
    # *FIX HACK
    # further refinement
    corpus_formatted_expanded_correct = [add_question_mark_or_period_to_sentence(line)
                                         for line in corpus_formatted_expanded_correct]
    return [custome_refine(line) for line in corpus_formatted_expanded_correct]
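The snippet above leans on a multiple_replace helper to expand abbreviations before spell checking. A minimal sketch of what such a helper could look like is below; the abbreviation map and the implementation are illustrative assumptions, not taken from the original code.

import re

# hypothetical abbreviation map; extend as needed
ABBREVIATIONS = {"u": "you", "r": "are", "idk": "i do not know"}

def multiple_replace(line, mapping=ABBREVIATIONS):
    # build one alternation pattern so every abbreviation is expanded in a single pass
    pattern = re.compile(r"\b(" + "|".join(re.escape(k) for k in mapping) + r")\b")
    return pattern.sub(lambda m: mapping[m.group(1)], line)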
def preprocess(text, twit=False):
    """
    Preprocess text for sentiment analysis.
    """
    tagger = twit_pos if twit else postag
    toks, pos = tagger(text)
    tokens = [spell.correct(t) for t in toks]
    pairs = map(list, zip(tokens, pos))
    return reduce_form(pairs)
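An illustrative call; it assumes twit_pos/postag each return a (tokens, tags) pair, spell.correct returns a corrected string, and reduce_form post-processes the resulting [token, tag] pairs:

# hypothetical usage; the input sentence is a placeholder
pairs = preprocess("thsi film was graet", twit=False)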
def spellcheck(raw):
    """
    Performs spell correction on the input.
    """
    l = []
    sents = nltk.tokenize.sent_tokenize(raw)
    for sent in sents:
        words = nltk.word_tokenize(sent)
        for word in words:
            l.append(spell.correct(word))
    return ' '.join(l)
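Hypothetical usage, assuming spell exposes a correct(word) -> str method (e.g. a Norvig-style corrector); the sample sentence is illustrative only:

fixed = spellcheck("Ths is a smple sentense with severl typos.")
# returns one space-joined string; exact output depends on the corrector, and
# nltk.word_tokenize keeps punctuation as separate tokens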
def spell_check(l):
    """
    Return a list of spell-checked, POS-tagged tweets in the format of [tokens, pos].
    """
    print 'spellchecking'
    ret = []
    for tweet in l:
        tokens = []
        for t in tweet[0]:
            tokens.append(s.correct(t))
        ret.append([tokens, tweet[1]])
    print 'done spellchecking'
    return ret
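The expected input here is a list of [tokens, pos] pairs. A hypothetical shape, assuming s.correct maps each token to a single corrected string:

tweets = [[['ths', 'movie', 'rocks'], ['DT', 'NN', 'VBZ']]]   # illustrative tagger output
checked = spell_check(tweets)
# -> [[['this', 'movie', 'rocks'], ['DT', 'NN', 'VBZ']]] if the corrector fixes 'ths'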
def predict(self, e):
    s = spell_checker.correct(self.editname.GetValue())
    self.result.SetLabel(s)
#!/usr/bin/python
import spell_checker

file = 'commonmisspelled.txt'
f = open(file)
error = []
correct = []
for line in f:
    error.append(line.split()[0])
    correct.append(line.split()[1])

meanReciprocalRank = 0
avgTime = 0.0
for i in range(len(error)):
    suggestions = spell_checker.correct(error[i])
    # avgTime += time
    reciprocalRank = 0.0
    for j in range(5):
        if suggestions[j][0] == correct[i]:
            reciprocalRank = 1.0 / (j + 1)
            break
    meanReciprocalRank += reciprocalRank
    print i

# print float(avgTime)/len(error)
print float(meanReciprocalRank) / len(error)
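A worked example of the reciprocal-rank logic in the loop above: if the intended word appears at 0-based position j among the top five suggestions, the query contributes 1/(j+1); otherwise it contributes nothing. The suggestion list below is hypothetical.

suggestions = [('recieve', 120), ('receive', 95), ('relieve', 40)]   # assumed (word, score) output
target = 'receive'
rr = next((1.0 / (j + 1) for j, (word, _) in enumerate(suggestions[:5]) if word == target), 0.0)
# rr == 0.5 because the correct word is the second-ranked suggestion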
def GET(self, name):
    user_data = web.input()
    return correct(user_data.text.replace('sapiAGRI ', ''))
def test(input_file_name, output_file_name):
    file = open(input_file_name)
    data = file.read()
    element = ET.XML(data)
    outfile = open(output_file_name, "w")
    # regex for single word.
    regex_single = re.compile('\w+')
    # regex for phrase.
    regex_phrase = re.compile('\w+|[,|:|(|)|\'|\"]')
    # creating the regex obj.
    regex = re.compile('\w+|[.|,|:|(|)|\'|\"]')
    for subelement in element:
        if subelement.attrib['id'][0] == '1':   # testcase1.xml
            outfile.write(subelement.attrib['id'])
            single_word = regex_single.findall(subelement.text)
            suggestions = spell_checker.correct(single_word[0])
            # printing the misspelled word.
            outfile.write(", " + single_word[0])
            i = 1
            # printing the suggestions.
            for each_suggestion in suggestions:
                outfile.write(", ")
                outfile.write(each_suggestion[0])
                i += 1
                if i > MAX_SUGGESTIONS:
                    outfile.write("\n")
                    break
        elif subelement.attrib['id'][0] == '2':   # testcase2.xml
            words_in_phrase = regex_phrase.findall(subelement.text)
            # identifying the incorrect words in the phrase.
            for i in xrange(0, len(words_in_phrase)):
                # if it is a punctuation mark then we should not query the dictionary.
                if isPunctuation(words_in_phrase[i]):
                    continue
                suggestions = spell_checker.correct(words_in_phrase[i])
                if len(suggestions) == 1:   # correct word.
                    continue
                outfile.write(subelement.attrib['id'])
                incorrect_word = words_in_phrase[i]
                outfile.write(", " + incorrect_word)
                temp_list = []
                for each_suggestion in suggestions[:MAX_UNIGRAM_SUGGESTIONS]:
                    words_in_phrase[i] = each_suggestion[0]
                    temp_list.append((each_suggestion[0],
                                      ngrams.getProbabilityOfPhrase(words_in_phrase, 100, False)))
                temp_list.sort(key=getSecond, reverse=True)
                j = 1
                # printing the suggestions.
                for each_temp in temp_list:
                    outfile.write(", ")
                    outfile.write(each_temp[0])
                    j += 1
                    if j > MAX_SUGGESTIONS:
                        outfile.write("\n")
                        break
        elif subelement.attrib['id'][0] == '3':   # testcase3.xml
            sentence = subelement.text
            words_in_sentence = regex.findall(sentence)   # TODO: dot is added here.
            # identifying the incorrect words in the sentence.
            for i in xrange(0, len(words_in_sentence)):
                # if it is a punctuation mark then we should not query the dictionary.
                if isPunctuation(words_in_sentence[i]):
                    continue
                suggestions = spell_checker.correct(words_in_sentence[i])
                if len(suggestions) == 1:   # correct word.
                    continue
                # patch.
                for k in xrange(0, len(suggestions)):
                    suggestions[k] = suggestions[k][0]
                outfile.write(subelement.attrib['id'])
                incorrect_word = words_in_sentence[i]
                outfile.write(", " + incorrect_word)
                # creating a new list to call the correctSentences function.
                aux_list = []
                for t in xrange(0, (len(words_in_sentence) - 1)):
                    aux_list.append([words_in_sentence[t]])
                aux_list[i] = suggestions[:MAX_SUGGESTIONS]
                print aux_list
                sorted_suggestions = pos_ngrams.correctSentences(aux_list)
                j = 1
                # printing the suggestions.
                for each_sorted_suggestion in sorted_suggestions:
                    outfile.write(", ")
                    outfile.write(each_sorted_suggestion)
                    j += 1
                    if j > MAX_SUGGESTIONS:
                        outfile.write("\n")
                        break
        elif subelement.attrib['id'][0] == '4':   # testcase4.xml
            sentences = subelement.text.split('.')   # dot is the sentence separator.
            for each_sentence in sentences:
                words_in_sentence = regex.findall(each_sentence)   # a list is returned.
                # identifying the incorrect words in the sentence.
                for i in xrange(0, len(words_in_sentence)):
                    # if it is a punctuation mark then we should not query the dictionary.
                    if isPunctuation(words_in_sentence[i]):
                        continue
                    suggestions = spell_checker.correct(words_in_sentence[i])
                    if len(suggestions) == 1:   # correct word.
                        continue
                    # patch.
                    for k in xrange(0, len(suggestions)):
                        suggestions[k] = suggestions[k][0]
                    outfile.write(subelement.attrib['id'])
                    incorrect_word = words_in_sentence[i]
                    outfile.write(", " + incorrect_word)
                    # creating a new list to call the correctSentences function.
                    aux_list = []
                    for t in xrange(0, len(words_in_sentence)):
                        aux_list.append([words_in_sentence[t]])
                    aux_list[i] = suggestions[:MAX_SUGGESTIONS]
                    print aux_list
                    sorted_suggestions = pos_ngrams.correctSentences(aux_list)
                    j = 1
                    # printing the suggestions.
                    for each_sorted_suggestion in sorted_suggestions:
                        outfile.write(", ")
                        outfile.write(each_sorted_suggestion)
                        j += 1
                        if j > MAX_SUGGESTIONS:
                            outfile.write("\n")
                            break
    file.close()
    outfile.close()
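For the '2' test cases above, each candidate correction is re-scored by substituting it into the phrase and asking the n-gram model for a phrase probability. A minimal sketch of that re-ranking step, assuming the ngrams module and the getProbabilityOfPhrase(words, 100, False) call from the snippet above are available; the helper name is hypothetical:

def rerank_by_phrase_probability(words, i, candidates, top_k=5):
    # substitute each candidate at position i and score the whole phrase
    scored = []
    for cand in candidates:
        trial = list(words)
        trial[i] = cand
        scored.append((cand, ngrams.getProbabilityOfPhrase(trial, 100, False)))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [cand for cand, _ in scored[:top_k]]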
def correct_spelling(line):
    return ' '.join([spell_checker.correct(word) for word in line.strip().split()])
def cleanText(s):
    s = unicode(s).lower()
    if isinstance(s, unicode):
        s = correct(s)
    s = s.replace(" x ", " xby ")
    s = s.replace("*", " xby ")
    s = s.replace(" by ", " xby")
    s = s.replace("x0", " xby 0")
    s = s.replace("x1", " xby 1")
    s = s.replace("x2", " xby 2")
    s = s.replace("x3", " xby 3")
    s = s.replace("x4", " xby 4")
    s = s.replace("x5", " xby 5")
    s = s.replace("x6", " xby 6")
    s = s.replace("x7", " xby 7")
    s = s.replace("x8", " xby 8")
    s = s.replace("x9", " xby 9")
    s = s.replace("0x", "0 xby ")
    s = s.replace("1x", "1 xby ")
    s = s.replace("2x", "2 xby ")
    s = s.replace("3x", "3 xby ")
    s = s.replace("4x", "4 xby ")
    s = s.replace("5x", "5 xby ")
    s = s.replace("6x", "6 xby ")
    s = s.replace("7x", "7 xby ")
    s = s.replace("8x", "8 xby ")
    s = s.replace("9x", "9 xby ")
    s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
    s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)
    s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)
    s = re.sub(r"([0-9]+)( *)(milimeters|mm)\.?", r"\1mm. ", s)
    s = re.sub(r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. ", s)
    s = re.sub(r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. ", s)
    s = re.sub(r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. ", s)
    s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. ", s)
    s = s.replace("whirpool", "whirlpool")
    s = s.replace("whirlpoolga", "whirlpool")
    s = s.replace("whirlpoolstainless", "whirlpool stainless")
    s = s.replace("  ", " ")
    s = re.sub(r"(\w)\.([A-Z])", r"\1 \2", s)  # Split words with a.A
    s = s.lower()
    s = s.replace("  ", " ")
    s = s.replace(",", "")  # could be number / segment later
    s = s.replace("$", " ")
    s = s.replace("?", " ")
    s = s.replace("-", " ")
    s = s.replace("//", "/")
    s = s.replace("..", ".")
    s = s.replace(" / ", " ")
    s = s.replace(" \\ ", " ")
    s = s.replace(".", " . ")
    s = re.sub(r"(^\.|/)", r"", s)
    s = re.sub(r"(\.|/)$", r"", s)
    s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
    s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
    s = s.replace(" x ", " xbi ")
    s = re.sub(r"([a-z])( *)\.( *)([a-z])", r"\1 \4", s)
    s = re.sub(r"([a-z])( *)/( *)([a-z])", r"\1 \4", s)
    s = s.replace("*", " xbi ")
    s = s.replace(" by ", " xbi ")
    s = re.sub(r"([0-9])( *)\.( *)([0-9])", r"\1.\4", s)
    s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
    s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
    s = re.sub(r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. ", s)
    s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(cubic|cu) ?\.?(feet|foot|ft)\.?", r"\1cu.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
    s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)
    s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)
    s = re.sub(r"([0-9]+)( *)(milimeters|mm)\.?", r"\1mm. ", s)
    s = s.replace("°", " degrees ")
    s = re.sub(r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. ", s)
    s = s.replace(" v ", " volts ")
    s = re.sub(r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. ", s)
    s = re.sub(r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. ", s)
    s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. ", s)
    s = s.replace("  ", " ")
    s = s.replace(" . ", " ")
    s = s.replace("toliet", "toilet")
    s = s.replace("airconditioner", "air conditioner")
    s = s.replace("vinal", "vinyl")
    s = s.replace("vynal", "vinyl")
    s = s.replace("skill", "skil")
    s = s.replace("snowbl", "snow bl")
    s = s.replace("plexigla", "plexi gla")
    s = s.replace("rustoleum", "rust-oleum")
    s = s.replace("whirpool", "whirlpool")
    s = s.replace("whirlpoolga", "whirlpool ga")
    s = s.replace("whirlpoolstainless", "whirlpool stainless")
    s = (" ").join([str(strNum[z]) if z in strNum else z for z in s.split(" ")])
    s = tokenizer.tokenize(s)
    s = [word for word in s if word not in stopwords.words('english')]
    return s
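An illustrative call to cleanText; it assumes correct, tokenizer, strNum, and the NLTK stopword list used above are available in scope:

tokens = cleanText("Whirpool refrigerator 30 in. x 66 in. stainless")
# the substitutions above normalize units (e.g. "30 in." -> "30in.") and fix brand
# typos ("whirpool" -> "whirlpool"); exact tokens depend on the corrector and tokenizer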
def str_correcter(s):
    return " ".join([spell_checker.correct(word) for word in s.lower().split()])
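Hypothetical usage, assuming spell_checker.correct(word) returns a single best correction as a string:

print(str_correcter("Ths Is A Smple Sentense"))
# e.g. "this is a simple sentence", depending on what the corrector suggests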