def preprocess(self, corpus):
    '''
    Iterates through the corpus line by line: tokenizing,
    spellchecking, normalizing abbreviations, etc.

    How to tokenize is debatable. I chose to do it by sentence,
    but it could also be done with a regex such as "[\w' ]+",
    which would give you smaller phrases.

    I: list of text strings
    O: preprocessed list of text strings
    '''

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    # lowercase and tokenize by sentence
    corpus_formatted = [tokenized_line for line in corpus
                        for tokenized_line in sent_detector.tokenize(line.strip().lower())]

    # consider redesigning as a helper function that applies multiple manipulations to a string

    # expand the abbreviations
    corpus_formatted_expanded = [multiple_replace(line) for line in corpus_formatted]

    # spell check (note: this step takes quite a bit of time)
    corpus_formatted_expanded_correct = [' '.join(spell_checker.correct(word) for word in line.strip().split())
                                         for line in corpus_formatted_expanded]

    # FIXME: hack -- further refinement
    corpus_formatted_expanded_correct = [add_question_mark_or_period_to_sentence(line)
                                         for line in corpus_formatted_expanded_correct]
    return [custome_refine(line) for line in corpus_formatted_expanded_correct]
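To illustrate the tokenization choice discussed in the docstring above, here is a minimal sketch (assuming the NLTK punkt data is installed) comparing sentence tokenization with the `[\w' ]+` regex alternative:

import re
import nltk  # requires: nltk.download('punkt')

text = "it's raining. i'll stay home, she said."

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_detector.tokenize(text))
# e.g. ["it's raining.", "i'll stay home, she said."]

print(re.findall(r"[\w' ]+", text))
# ["it's raining", " i'll stay home", " she said"]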
Example #2
def preprocess(text, twit=False):
    """Preprocess text for sentiment analysis."""
    tagger = twit_pos if twit else postag
    toks, pos = tagger(text)
    tokens = [spell.correct(t) for t in toks]
    # map() is lazy on Python 3, so materialize the [token, pos] pairs
    pairs = [list(p) for p in zip(tokens, pos)]
    return reduce_form(pairs)
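A hedged usage sketch of the pairing step; the tagger output below is hypothetical (twit_pos, postag, and reduce_form are not shown in this example):

# hypothetical tagger output: parallel token and POS lists
toks = ['i', 'lovee', 'this']
pos = ['PRP', 'VBP', 'DT']

tokens = ['i', 'love', 'this']  # e.g. after spell.correct('lovee') -> 'love'
pairs = [list(p) for p in zip(tokens, pos)]
print(pairs)  # [['i', 'PRP'], ['love', 'VBP'], ['this', 'DT']]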
Example #3
def spellcheck(raw):
    """Perform spell correction on the input."""
    corrected = []
    sents = nltk.tokenize.sent_tokenize(raw)
    for sent in sents:
        words = nltk.word_tokenize(sent)
        for word in words:
            corrected.append(spell.correct(word))
    return ' '.join(corrected)
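A small usage sketch of spellcheck above, with a stand-in corrector (the real spell object is not shown). Note that rejoining on spaces detaches punctuation, since nltk.word_tokenize emits it as separate tokens:

# stand-in for the spell object assumed by spellcheck
class _Spell:
    def correct(self, word):
        return {'teh': 'the', 'quik': 'quick'}.get(word, word)

spell = _Spell()
print(spellcheck("teh quik fox. it jumps."))
# 'the quick fox . it jumps .'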
def spell_check(l):
    """
    Return a list of spell-checked, pos-tagged tweets in the format of [tokens, pos].
    """
    print('spellchecking')
    ret = []
    for tweet in l:
        tokens = []
        for t in tweet[0]:
            tokens.append(s.correct(t))
        ret.append([tokens, tweet[1]])
    print('done spellchecking')
    return ret
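A toy call in a single-file context, assuming `s` is any object exposing correct(token) -> str (the stub below is an assumption, not the original corrector):

class _Stub:
    def correct(self, t):
        return {'gr8': 'great', 'kewl': 'cool'}.get(t, t)

s = _Stub()  # spell_check reads the corrector from a global named s
tweets = [[['gr8', 'game'], ['JJ', 'NN']], [['so', 'kewl'], ['RB', 'JJ']]]
print(spell_check(tweets))
# [[['great', 'game'], ['JJ', 'NN']], [['so', 'cool'], ['RB', 'JJ']]]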
    def predict(self, e):
        s = spell_checker.correct(self.editname.GetValue())
        self.result.SetLabel(s)
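For context, a minimal hypothetical wxPython frame this handler could live in; editname is assumed to be a wx.TextCtrl and result a wx.StaticText:

import wx
import spell_checker

class SpellFrame(wx.Frame):
    def __init__(self):
        super().__init__(None, title='Spell checker')
        panel = wx.Panel(self)
        self.editname = wx.TextCtrl(panel, pos=(10, 10))
        button = wx.Button(panel, label='Correct', pos=(10, 40))
        self.result = wx.StaticText(panel, pos=(10, 70))
        button.Bind(wx.EVT_BUTTON, self.predict)

    def predict(self, e):
        s = spell_checker.correct(self.editname.GetValue())
        self.result.SetLabel(s)

app = wx.App()
SpellFrame().Show()
app.MainLoop()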
Example #9
#!/usr/bin/python
import spell_checker

filename = 'commonmisspelled.txt'

# each line holds a misspelled word followed by its correction
error = []
correct = []
with open(filename) as f:
    for line in f:
        error.append(line.split()[0])
        correct.append(line.split()[1])

meanReciprocalRank = 0.0
avgTime = 0.0
for i in range(len(error)):
    suggestions = spell_checker.correct(error[i])
    #avgTime += time
    reciprocalRank = 0.0
    # look for the expected correction among the top five suggestions
    for j in range(min(5, len(suggestions))):
        if suggestions[j][0] == correct[i]:
            reciprocalRank = 1.0 / (j + 1)
            break
    meanReciprocalRank += reciprocalRank
    print(i)

#print(avgTime / len(error))
print(meanReciprocalRank / len(error))
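For reference, a worked example of the reciprocal-rank computation above with toy data: a hit at rank 1 contributes 1/1, a hit at rank 2 contributes 1/2, and a miss in the top five contributes 0.

# (ranked suggestions, expected correction)
queries = [
    (['the', 'then', 'them'], 'the'),   # rank 1 -> 1.0
    (['ten', 'then', 'them'], 'then'),  # rank 2 -> 0.5
    (['ton', 'tan', 'tin'], 'ten'),     # not found -> 0.0
]

mrr = 0.0
for suggestions, expected in queries:
    rr = 0.0
    for j, word in enumerate(suggestions[:5]):
        if word == expected:
            rr = 1.0 / (j + 1)
            break
    mrr += rr

print(mrr / len(queries))  # (1.0 + 0.5 + 0.0) / 3 = 0.5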
Example #10
    def GET(self, name):
        user_data = web.input()
        return correct(user_data.text.replace('sapiAGRI ', ''))
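A hedged sketch of the web.py wiring such a handler typically needs; the URL mapping and the import are assumptions, not part of the original snippet:

import web
from spell_checker import correct  # assumed source of correct()

urls = ('/correct/(.*)', 'Correct')

class Correct:
    def GET(self, name):
        user_data = web.input()
        # strip the 'sapiAGRI ' prefix before correcting
        return correct(user_data.text.replace('sapiAGRI ', ''))

if __name__ == '__main__':
    app = web.application(urls, globals())
    app.run()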
Example #11
def test(input_file_name, output_file_name):
    infile = open(input_file_name)
    data = infile.read()
    element = ET.XML(data)

    outfile = open(output_file_name, "w")

    # regex for a single word
    regex_single = re.compile(r'\w+')

    # regex for a phrase: words plus selected punctuation
    # (note: '|' inside a character class is literal, so the original
    # pattern also matched a literal '|'; that is dropped here)
    regex_phrase = re.compile(r"\w+|[,:()'\"]")

    # regex for a sentence: the same, with '.' added to the punctuation set
    regex = re.compile(r"\w+|[.,:()'\"]")

    for subelement in element:

        if subelement.attrib['id'][0] == '1':  # testcase1.xml
            outfile.write(subelement.attrib['id'])

            single_word = regex_single.findall(subelement.text)

            suggestions = spell_checker.correct(single_word[0])

            # print the misspelled word
            outfile.write(", " + single_word[0])

            i = 1
            # print the suggestions
            for each_suggestion in suggestions:
                outfile.write(", ")
                outfile.write(each_suggestion[0])

                i += 1
                if i > MAX_SUGGESTIONS:
                    outfile.write("\n")
                    break

        elif subelement.attrib['id'][0] == '2':  # testcase2.xml
            words_in_phrase = regex_phrase.findall(subelement.text)

            # identify the incorrect words in the phrase
            for i in range(len(words_in_phrase)):

                # punctuation should not be queried against the dictionary
                if isPunctuation(words_in_phrase[i]):
                    continue

                suggestions = spell_checker.correct(words_in_phrase[i])
                if len(suggestions) == 1:  # correct word
                    continue

                outfile.write(subelement.attrib['id'])
                incorrect_word = words_in_phrase[i]
                outfile.write(", " + incorrect_word)

                # rescore each candidate by the probability of the resulting phrase
                temp_list = []
                for each_suggestion in suggestions[:MAX_UNIGRAM_SUGGESTIONS]:
                    words_in_phrase[i] = each_suggestion[0]
                    temp_list.append((each_suggestion[0],
                                      ngrams.getProbabilityOfPhrase(words_in_phrase, 100, False)))

                temp_list.sort(key=getSecond, reverse=True)

                j = 1
                # print the suggestions
                for each_temp in temp_list:
                    outfile.write(", ")
                    outfile.write(each_temp[0])

                    j += 1
                    if j > MAX_SUGGESTIONS:
                        outfile.write("\n")
                        break

        elif subelement.attrib['id'][0] == '3':  # testcases3.xml
            sentence = subelement.text

            words_in_sentence = regex.findall(sentence)  # TODO: dot is added here.

            # identify the incorrect words in the sentence
            for i in range(len(words_in_sentence)):

                # punctuation should not be queried against the dictionary
                if isPunctuation(words_in_sentence[i]):
                    continue

                suggestions = spell_checker.correct(words_in_sentence[i])
                if len(suggestions) == 1:  # correct word
                    continue

                # keep only the suggested words, dropping their scores
                for k in range(len(suggestions)):
                    suggestions[k] = suggestions[k][0]

                outfile.write(subelement.attrib['id'])
                incorrect_word = words_in_sentence[i]
                outfile.write(", " + incorrect_word)

                # build the nested list expected by correctSentences
                # (note: unlike testcase 4 below, the last token is excluded here)
                aux_list = []
                for t in range(len(words_in_sentence) - 1):
                    aux_list.append([words_in_sentence[t]])

                aux_list[i] = suggestions[:MAX_SUGGESTIONS]
                print(aux_list)
                sorted_suggestions = pos_ngrams.correctSentences(aux_list)

                j = 1
                # print the suggestions
                for each_sorted_suggestion in sorted_suggestions:
                    outfile.write(", ")
                    outfile.write(each_sorted_suggestion)

                    j += 1
                    if j > MAX_SUGGESTIONS:
                        outfile.write("\n")
                        break

        elif subelement.attrib['id'][0] == '4':  # testcases4.xml
            sentences = subelement.text.split('.')  # dot is the sentence separator

            for each_sentence in sentences:
                words_in_sentence = regex.findall(each_sentence)  # a list is returned

                # identify the incorrect words in the sentence
                for i in range(len(words_in_sentence)):

                    # punctuation should not be queried against the dictionary
                    if isPunctuation(words_in_sentence[i]):
                        continue

                    suggestions = spell_checker.correct(words_in_sentence[i])
                    if len(suggestions) == 1:  # correct word
                        continue

                    # keep only the suggested words, dropping their scores
                    for k in range(len(suggestions)):
                        suggestions[k] = suggestions[k][0]

                    outfile.write(subelement.attrib['id'])
                    incorrect_word = words_in_sentence[i]
                    outfile.write(", " + incorrect_word)

                    # build the nested list expected by correctSentences
                    aux_list = []
                    for t in range(len(words_in_sentence)):
                        aux_list.append([words_in_sentence[t]])

                    aux_list[i] = suggestions[:MAX_SUGGESTIONS]
                    print(aux_list)
                    sorted_suggestions = pos_ngrams.correctSentences(aux_list)

                    j = 1
                    # print the suggestions
                    for each_sorted_suggestion in sorted_suggestions:
                        outfile.write(", ")
                        outfile.write(each_sorted_suggestion)

                        j += 1
                        if j > MAX_SUGGESTIONS:
                            outfile.write("\n")
                            break

    infile.close()
    outfile.close()
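The function above relies on several module-level names that are not shown. Plausible sketches of what they might look like (assumptions, not the original code):

import re
import string
import xml.etree.ElementTree as ET  # the ET used above

MAX_SUGGESTIONS = 5           # assumed cap on printed suggestions
MAX_UNIGRAM_SUGGESTIONS = 10  # assumed cap on candidates rescored by ngrams

def isPunctuation(token):
    # a token counts as punctuation if every character is punctuation
    return all(ch in string.punctuation for ch in token)

def getSecond(pair):
    # sort key: the probability in a (word, probability) tuple
    return pair[1]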
Example #12
def correct_spelling(line):
    return ' '.join(
        [spell_checker.correct(word) for word in line.strip().split()])
Example #13
def cleanText(s):
	s = str(s).lower()
	if isinstance(s, str):
		s = correct(s)
		s = s.replace(" x "," xby ")
		s = s.replace("*"," xby ")
		s = s.replace(" by "," xby")
		s = s.replace("x0"," xby 0")
		s = s.replace("x1"," xby 1")
		s = s.replace("x2"," xby 2")
		s = s.replace("x3"," xby 3")
		s = s.replace("x4"," xby 4")
		s = s.replace("x5"," xby 5")
		s = s.replace("x6"," xby 6")
		s = s.replace("x7"," xby 7")
		s = s.replace("x8"," xby 8")
		s = s.replace("x9"," xby 9")
		s = s.replace("0x","0 xby ")
		s = s.replace("1x","1 xby ")
		s = s.replace("2x","2 xby ")
		s = s.replace("3x","3 xby ")
		s = s.replace("4x","4 xby ")
		s = s.replace("5x","5 xby ")
		s = s.replace("6x","6 xby ")
		s = s.replace("7x","7 xby ")
		s = s.replace("8x","8 xby ")
		s = s.replace("9x","9 xby ")
		s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
		s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
		s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)    
		s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)    
		s = re.sub(r"([0-9]+)( *)(milimeters|mm)\.?", r"\1mm. ", s)		
		s = re.sub(r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. ", s)    
		s = re.sub(r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. ", s)		
		s = re.sub(r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. ", s)    
		s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. ", s)		
		s = s.replace("whirpool","whirlpool")
		s = s.replace("whirlpoolga", "whirlpool")
		s = s.replace("whirlpoolstainless","whirlpool stainless")
		s = s.replace("  "," ")
		s = re.sub(r"(\w)\.([A-Z])", r"\1 \2", s) #Split words with a.A
		s = s.lower()
		s = s.replace("  "," ")
		s = s.replace(",","") #could be number / segment later
		s = s.replace("$"," ")
		s = s.replace("?"," ")
		s = s.replace("-"," ")
		s = s.replace("//","/")
		s = s.replace("..",".")
		s = s.replace(" / "," ")
		s = s.replace(" \\ "," ")
		s = s.replace("."," . ")
		s = re.sub(r"(^\.|/)", r"", s)
		s = re.sub(r"(\.|/)$", r"", s)
		s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
		s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
		s = s.replace(" x "," xbi ")
		s = re.sub(r"([a-z])( *)\.( *)([a-z])", r"\1 \4", s)
		s = re.sub(r"([a-z])( *)/( *)([a-z])", r"\1 \4", s)
		s = s.replace("*"," xbi ")
		s = s.replace(" by "," xbi ")
		s = re.sub(r"([0-9])( *)\.( *)([0-9])", r"\1.\4", s)
		s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
		s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
		s = re.sub(r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. ", s)
		s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
		s = re.sub(r"([0-9]+)( *)(cubic|cu) ?\.?(feet|foot|ft)\.?", r"\1cu.ft. ", s)
		s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
		s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)
		s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)
		s = re.sub(r"([0-9]+)( *)(milimeters|mm)\.?", r"\1mm. ", s)
		s = s.replace("°"," degrees ")
		s = re.sub(r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. ", s)
		s = s.replace(" v "," volts ")
		s = re.sub(r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. ", s)
		s = re.sub(r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. ", s)
		s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. ", s)
		s = s.replace("  "," ")
		s = s.replace(" . "," ")
		s = s.replace("toliet","toilet")
		s = s.replace("airconditioner","air conditioner")
		s = s.replace("vinal","vinyl")
		s = s.replace("vynal","vinyl")
		s = s.replace("skill","skil")
		s = s.replace("snowbl","snow bl")
		s = s.replace("plexigla","plexi gla")
		s = s.replace("rustoleum","rust-oleum")
		s = s.replace("whirpool","whirlpool")
		s = s.replace("whirlpoolga", "whirlpool ga")
		s = s.replace("whirlpoolstainless","whirlpool stainless")
		s = (" ").join([str(strNum[z]) if z in strNum else z for z in s.split(" ")])
		s = tokenizer.tokenize(s)
		s = [word for word in s if word not in stopwords.words('english')]
		return s
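To illustrate the unit-normalization passes in cleanText, here are two of the substitutions applied to a sample string (the doubled spaces they leave are collapsed by the later replace("  ", " ") calls):

import re

s = "board is 24 in. x 10 ft long"
s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
print(s)  # 'board is 24in.  x 10ft.  long'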
Example #15
def str_correcter(s):
    return " ".join(
        [spell_checker.correct(word) for word in s.lower().split()])