Example #1
import re
import spell

def correct(x):
    # Find the alphabetic words in x and spell-correct each one.
    words = re.findall(r"\b([a-zA-Z]+)\b", x)
    y = ''
    for word in words:
        y = y + spell.correct(word) + ' '
    return y
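All of these examples lean on a spell module that exposes correct(), but none of them ship it. For context, a minimal sketch in the spirit of Norvig's well-known corrector; the training file big.txt is a placeholder, not something the examples define:

import re
from collections import Counter

# Word-frequency model built from any large plain-text corpus.
WORDS = Counter(re.findall(r'[a-z]+', open('big.txt').read().lower()))

def edits1(word):
    # Every string one edit away: deletes, transposes, replaces, inserts.
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def known(words):
    return set(w for w in words if w in WORDS)

def correct(word):
    # Prefer the word itself if known, then known words one edit away,
    # otherwise give the input back unchanged.
    candidates = known([word]) or known(edits1(word)) or [word]
    return max(candidates, key=lambda w: WORDS[w])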
Example #2
    def queryIndex(self):
        self.inputFile()
        self.readIndex()
        self.getStopwords()

        while True:
            print 'Enter query'
            #q=sys.stdin.readline()
            q = raw_input()
            if q == '':
                break
            # 1 = lemmatize, 2 = spell-correct, anything else = run a search
            print 'Enter number:'
            s = raw_input()
            if s == '1':
                print lmtzr.lemmatize(q)
            elif s == '2':
                print spell.correct(q, self.spellCheck)
            else:
                qt = self.queryType(q)
                if qt == 'one_word':
                    self.one_word(q)
                elif qt == 'free':
                    self.free(q)
                elif qt == 'phrase':
                    self.phrase(q)
Example #4
    def one_word(self, q):
        '''One Word Query'''
        originalQuery = q
        q = self.getTerms(q)
        if len(q) == 0:
            print ''
            return
        elif len(q) > 1:
            self.free(originalQuery)
            return

        #q contains only 1 term
        term = q[0]
        if term not in self.index:
            # Unknown term: suggest a corrected, lemmatized form instead.
            term = spell.correct(term, self.spellCheck)
            #term=spell.correct(term)
            print 'did you mean',
            term = lmtzr.lemmatize(term)
            q = [term]
            print term
        postings = self.index[term]
        #print postings
        docs = [x[0] for x in postings]
        #print docs
        self.ranking(q, docs)
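Examples #2, #4 and #6 call a two-argument spell.correct(word, model), which suggests a project-local variant that takes the word-frequency model explicitly instead of reading module state. A hypothetical sketch of that signature, reusing edits1 from the sketch under Example #1:

def correct(word, model):
    # 'model' is assumed to map known words to corpus frequencies.
    candidates = ({word} & set(model)) or (edits1(word) & set(model)) or {word}
    return max(candidates, key=lambda w: model.get(w, 0))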
Example #5
def correctedWord(word, d):
	#suggest = d.suggest(word)
	#if len(suggest) > 0:
	#	return random.choice(suggest)
	#else:
	#	return word
	# spell.correct here is assumed to return a collection of
	# candidate suggestions; pick one of them at random.
	ret = random.sample(spell.correct(word), 1)
	return ret[0]
Example #6
def correct_answer(NWORDS, all_words,student_answers):
	spell_corrected_answers = []
	for answer in student_answers:
		answer_words = []
		for word in answer[0].split():
			answer_words.append(spell.correct(word, NWORDS))
		answer_string = " ".join(answer_words)
		spell_corrected_answers.append((answer_string,answer[1]))
	return spell_corrected_answers
Example #8
def spell_correct(tweet):
    #Splitting the sentence into words
    words = tweet.split()
    text = ""
    for w in words:
        #spell correction module in spell.py
        w = correct(w)
        text += w + " "

    return text
Example #9
def sanitize(text):
	# Lowercase, clean and spell-correct each word, dropping stopwords.
	ret = []
	for word in text:
		word = tolower(word)
		word = goodify(word)
		word = correct(word)
		if word not in stop_words:
			ret.append(word)
	return ret
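sanitize relies on tolower, goodify and a stop_words collection that the snippet does not define. Plausible stand-ins for experimenting with it; goodify in particular is a guess that simply strips non-letters:

import re

stop_words = {'the', 'a', 'an', 'and', 'or', 'to', 'of'}  # placeholder list

def tolower(word):
    return word.lower()

def goodify(word):
    # Hypothetical cleanup: keep ASCII letters only.
    return re.sub(r'[^a-zA-Z]+', '', word)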
Example #11
def get_responses(message, weights=word_weights, sents=categorized_sentences, clas=classifier, threshold=0.4, n=10):
    # pre tokenize and spelling correct message
    tkn_message = nltk.word_tokenize(correct(message))
    print(tkn_message)
    has_non_stop = False
    # check if message has any words that are in the dictionary
    for word in tkn_message:
        if word in weights:
            has_non_stop = True
    # get response types
    typs = response_types[clas.classify(sentence_features(tkn_message))]
    print(typs)
    relevant = []
    i = 0
    # if some words are in the dictionary, look for a sentence with sufficient similarity
    if has_non_stop:
        for s in sents:
            # relax the threshold every 500 sentences to speed up the search
            if i > 500 and threshold >= 0.1:
                threshold -= 0.05
                i = 0
            i += 1
            sm = sent_similarity(tkn_message, nltk.word_tokenize(s[0]), weights)
            if sm >= threshold and s[1] in typs:
                relevant.append((s, sm))
            if len(relevant) >= n:
                break
    # if no dictionary words are found, give up and just look for a sentence with the right type
    else:
        for s in sents:
            if s[1] in typs:
                relevant.append((s, 0))
            if len(relevant) >= n:
                break

    # sort by relevance, most similar first
    relevant.sort(key=lambda x: x[1], reverse=True)
    return relevant[:n]
Example #12
def getScore(n):
    # Corrupt one character in each of n random Brown-corpus sentences,
    # run the corrector, and return the fraction it fails to restore.
    text = nltk.corpus.brown.sents()
    ss = []
    wss = []
    for i in xrange(n):
        s = []
        while len(s) <= 2 or s[0][0] > "a" or s[-1] != ".":
            s = text[int(random.uniform(0, len(text)))]
        ss.append(s)

        ws = list(s)
        w = []
        while len(w) <= 1 or "." in w:
            j = int(random.uniform(0, len(ws) - 1))
            w = list(ws[j])
        k = int(random.uniform(0, len(w)))
        c = chr(int(random.uniform(ord("a"), ord("z") + 1)))
        w[k] = c
        ws[j] = "".join(w)
        # print c, s[j], ws[j]
        assert len(s[j]) == len(ws[j])
        wss.append(ws)

    ss = unsentences(ss)
    wss = unsentences(wss)
    css = spell.correct(wss)
    ss = splitSentence(ss)
    wss = splitSentence(wss)
    css = splitSentence(css)
    dd = 0
    for (s, ws, cs) in zip(ss, wss, css):
        print unwords(s)
        print unwords(ws)
        print unwords(cs)
        d = distance.distance(s, cs)
        print d
        print
        if d > 0:
            dd += 1
    return float(dd) / n
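A plausible way to drive getScore: the return value is the fraction of corrupted sentences the corrector failed to restore exactly, so 0.0 means every single-character corruption was repaired.

# Sample 50 perturbed sentences and report the residual error rate.
print(getScore(50))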
Example #13
def normal_srch(query):
	start_time = time.time()
	wrong_words=index_search(query)
	split_query=str(query).split()
	corrected_word=""
	flag=0					#if wrong_words is not empty then generate corrected string
	if wrong_words:
		
		for x in split_query:
			if x in wrong_words:
				corrected_word = corrected_word + " " + spell.correct(x)				
			else:
				corrected_word=corrected_word + " " + x
		
		final_result=index_search(corrected_word)
		flag=1

	query_time=time.time()-start_time
	
	if wrong_words and not final_result:
		print "Did you mean... "
		for x in corrected_word.split():
			
			if x in split_query:
				print "%s"%x
			else:
				print "<font color=blue><i>%s</i></font>"%x
	
	still_words_remaining = not(len(wrong_words) == len(split_query))
	
	if not wrong_words or not final_result or still_words_remaining:
		if flag==1 :					##case when input string has been corrected..
			evaluate(corrected_word)
		else:
			evaluate(query)
		print "<br>Your search query took <b><font color=blue>%s</font></b> seconds.<br><hr>"%query_time
		print_results()
		
	else:
		print "<br>Your search <b>%s</b> did not match any document"%(query)		
Example #14
	def get_stats(self, line2, label, badwords, negword_list, posword_list):
		codecs.register_error('replace_with_space', self.handler)

		#count uppercase letters
		caps_count = sum(x.isupper() for x in line2)

		#remove garbage, lowercase & strip
		line = ''.join(filter(lambda x: ord(x) < 128, line2.lower().strip()))

		#remove double quotes
		line = line[1:-1]

		#decode to ascii
		line = line.decode('string-escape').decode('utf-8', 'replace_with_space').encode('ascii', 'ignore').decode('unicode-escape').encode('iso-8859-1', 'replace_with_space')

		#replace a leading @name with a placeholder
		line = re.sub(r'^@\w{2,}', r'NameOfPerson', line)

		#count words with unwanted repetitions
		rep_count = len(re.findall(r'(.)\1\1+', line))

		#remove unwanted repetitions
		line = re.sub(r'(.)\1\1+', r'\1', line)

		#replace badwords
		for badword in badwords:
			line = re.sub(r"\b" + re.escape(badword) + r"\b|[!@#$%^&*+?~`]{3,}", r'xxbdWrdxx', line)

		#replace 'u' with 'you' & 'ur' with 'you are'
		line = re.sub(r"\bu\b", r'you', line)
		line = re.sub(r"\bu\s*r\b", r'you are', line)

		#correct spelling
		tmp_line = []
		for word in re.split(r"[^\w\,\'\.\-\?\!]+", line):
			tmp_line.append(spell.correct(word))
		line = ' '.join(tmp_line)

		#count negative words
		negword_count = 0
		for negword in negword_list:
			negword_count += line.count(negword.strip())

		#count positive words
		posword_count = 0
		for posword in posword_list:
			posword_count += line.count(posword.strip())
		
		#---categorize counts---
		#categorize badword_count
		badword_count = line.count("xxbdWrdxx")
		if badword_count >=3:
			badword_count = 3
		#categorize rep_count
		if rep_count == 2:
			rep_count = 1
		elif rep_count >=3:
			rep_count = 2
		#categorize negword_count
		if negword_count == 2:
			negword_count = 1
		elif negword_count >= 3 and negword_count <= 7:
			negword_count = 2
		elif negword_count >= 8:
			negword_count = 3
		#categorize posword_count
		if posword_count == 2:
			posword_count = 1
		elif posword_count >= 3 and posword_count <= 6:
			posword_count = 2
		elif posword_count >= 7:
			posword_count = 3
		#categorize caps_count
		if caps_count == 2:
			caps_count = 1
		elif caps_count >=3 and caps_count <=5:
			caps_count = 2
		elif caps_count >=6:
			caps_count = 3
	
		#write processed line and stats to file
		return [label, badword_count, rep_count, negword_count, posword_count, "\""+line+"\"", caps_count]
Example #15
    # documents.setdefault("doc2", doc2)

    documents = {}
    for filename in os.listdir('story'):
        f = open(os.path.join('story', filename)).read()
        documents.setdefault(filename.decode('utf-8'), f)
    for doc_id, text in documents.iteritems():
        doc_index = inverted_index(text)
        inverted_index_add(inverted, doc_id, doc_index)

    # Print Inverted-Index
    for word, doc_locations in inverted.iteritems():
        print word, doc_locations

    # # Search something and print results
    queries = [spell.correct('good')]
    for query in queries:
        result_docs = search(inverted, query)
        print "Search for '%s': %s" % (query, u','.join(result_docs.keys()))  # %s是str()输出字符串%r是repr()输出对象


        def extract_text(doc, index):
            return documents[doc].decode('utf-8')[index:index + 30].replace('\n', ' ')


        if result_docs:
            result_docs_list = sorted(result_docs.items(),key=lambda x:len(x[1]), reverse = True)
            # result_docs = dict(result_docs_list)

            for doc, offsets in result_docs_list:
                for offset in offsets:
Example #16
    # Window-event loop fragment: on a spell request, scan back from the
    # selection to the start of the word, correct it, and write it back.
    (c1, c2, q0, q2, flag, nr, r) = win.getevent()
    if c2 in "xX":
        if flag & 2:
            win.getevent()
        if flag & 8:
            win.getevent()
            win.getevent()
        win.writeevent(c1, c2, q0, q2)
        if c2 == "x" and r == "Del":
            outwin.delete()
            break
    if c1 == "K" and c2 == "I":
        ch = r[0]
        if ch in " \t\r\n":
            outwin.replace(",", "")
            continue
        while q0 >= 0 and not (ch in " \t\r\n"):
            sss = win.read(q0, q0+1)
            if not sss:
                # print("empty sss %d" % q0)
                sss = " "
            ch = sss[0]
            q0 -= 1
        if q0 < 0 and not(ch in " \t\r\n"):
            q0 = 0
        else:
            q0 += 2
        ss = win.read(q0,q2)
        lastcorrect = spell.correct(ss)
        outwin.replace(",", lastcorrect)
Example #17
def stem(word):
	# The stemmer call is disabled; this routes through the spell corrector.
	# word = stemmer.stem(word)
	word = correct(word)
	return word
Example #18
    def spellCorrector(self, text):
        # Split on anything that is not a word character or common
        # punctuation, spell-correct each token, and rejoin.
        tmp_line = []
        for word in re.split(r"[^\w\,\'\.\-\?\!]+", text):
            tmp_line.append(spell.correct(word))
        line = ' '.join(tmp_line)
        return line
Example #19
    def post(self):
        userInput = cgi.escape(self.request.get('text'))
        pageVar = {'word': userInput, 'correct': spell.correct(userInput)}
        page = JINJA_ENVIRONMENT.get_template('main.html')
        self.response.write(page.render(pageVar))
Example #20
#-*- coding: utf-8 -*-
from spell import correct

# 'والسسلام' is Arabic "and peace", misspelled with a doubled letter.
x = correct('والسسلام')
#print unicode(x,'utf-8')
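Note that a corrector like the sketch under Example #1 trains on [a-z]+ and edits over an ASCII alphabet, so it cannot learn or repair Arabic input like this. A Unicode-aware variant has to tokenize on \w+ and draw its edit alphabet from the corpus; a sketch under those assumptions:

import re
from collections import Counter

def train_unicode(text):
    # In Python 3, \w+ matches letters of any script.
    return Counter(re.findall(r'\w+', text.lower()))

def corpus_alphabet(words):
    # Derive the edit alphabet from the training data itself,
    # so replaces/inserts use the right script.
    return sorted({ch for w in words for ch in w})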