def correct(x):
    # spell-correct each alphabetic token in x and rejoin with spaces
    line = re.findall(r"\b([a-zA-Z]+)\b", x)
    y = ''
    for word in line:
        # (a stray spell.correct(x) call on the whole string, whose result was
        # discarded on every iteration, has been removed)
        y = y + spell.correct(word) + ' '
    return y
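# Every snippet in this section leans on a ``spell`` module exposing
# ``correct()``. Below is a minimal sketch of the Norvig-style corrector these
# calls appear to assume -- ``big.txt``, ``WORDS``, ``edits1`` and ``known``
# are illustrative names, not the actual API of any snippet's spell module:
import re
from collections import Counter

# assumed word-frequency model built from a stand-in corpus file
WORDS = Counter(re.findall(r'[a-z]+', open('big.txt').read().lower()))

def edits1(word):
    # all strings one edit (delete/transpose/replace/insert) away from word
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def known(words):
    # keep only candidates that occur in the frequency model
    return set(w for w in words if w in WORDS)

def correct(word):
    # most frequent candidate: the word itself, else 1-edit, else 2-edit forms
    candidates = (known([word]) or known(edits1(word)) or
                  known(e2 for e1 in edits1(word) for e2 in edits1(e1)) or
                  [word])
    return max(candidates, key=lambda w: WORDS.get(w, 0))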
def queryIndex(self):
    self.inputFile()
    self.readIndex()
    self.getStopwords()
    while True:
        print 'Enter query'
        #q = sys.stdin.readline()
        q = raw_input()
        if q == '':
            break
        print 'Enter number:'
        s = raw_input()
        if s == '1':
            print lmtzr.lemmatize(q)
        elif s == '2':
            print spell.correct(q, self.spellCheck)
        else:
            qt = self.queryType(q)
            if qt == 'one_word':
                self.one_word(q)
            elif qt == 'free':
                self.free(q)
            elif qt == 'phrase':
                self.phrase(q)
def one_word(self, q):
    '''One Word Query'''
    originalQuery = q
    q = self.getTerms(q)
    if len(q) == 0:
        print ''
        return
    elif len(q) > 1:
        self.free(originalQuery)
        return
    # q contains only 1 term
    term = q[0]
    if term not in self.index:
        term = spell.correct(term, self.spellCheck)
        #term = spell.correct(term)
        print 'did you mean',
    term = lmtzr.lemmatize(term)
    q = []
    q.append(term)
    print term
    postings = self.index[term]
    #print postings
    docs = [x[0] for x in postings]
    #print docs
    self.ranking(q, docs)
def correctedWord(word, d):
    #suggest = d.suggest(word)
    #if len(suggest) > 0:
    #    return random.choice(suggest)
    #else:
    #    return word
    # assumes spell.correct returns a collection of candidates to sample from
    ret = random.sample(spell.correct(word), 1)
    return ret[0]
def correct_answer(NWORDS, all_words, student_answers):
    spell_corrected_answers = []
    for answer in student_answers:
        answer_words = []
        for word in answer[0].split():
            answer_words.append(spell.correct(word, NWORDS))
        answer_string = " ".join(answer_words)
        spell_corrected_answers.append((answer_string, answer[1]))
    return spell_corrected_answers
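# The snippet above passes the frequency model explicitly as a second
# argument. A minimal sketch of that two-argument variant, assuming NWORDS
# maps lowercase words to counts and reusing edits1 from the sketch earlier
# (again illustrative names, not any particular project's API):
def correct(word, NWORDS):
    def known(words):
        return set(w for w in words if w in NWORDS)
    # prefer the word itself, then any known 1-edit candidate
    candidates = known([word]) or known(edits1(word)) or [word]
    return max(candidates, key=lambda w: NWORDS.get(w, 0))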
def spell_correct(tweet):
    # split the sentence into words
    words = tweet.split()
    text = ""
    for w in words:
        # spell-correction module in spell.py
        w = correct(w)
        text += w + " "
    return text
def sanitize(text):
    ret = []
    for word in text:
        word = tolower(word)
        word = goodify(word)
        word = correct(word)
        if word not in stop_words:
            ret.append(word)
    return ret
def get_responses(message, weights=word_weights, sents=categorized_sentences,
                  clas=classifier, threshold=0.4, n=10):
    # pre-tokenize and spell-correct the message
    tkn_message = nltk.word_tokenize(correct(message))
    print(tkn_message)
    # check if the message has any words that are in the dictionary
    has_non_stop = False
    for word in tkn_message:
        if word in word_weights:
            has_non_stop = True
    # get response types
    typs = response_types[clas.classify(sentence_features(tkn_message))]
    print(typs)
    relevant = []
    i = 0
    # if some words are in the dictionary, look for sentences with sufficient similarity
    if has_non_stop:
        for s in sents:
            # decrement the threshold every 500 sentences to speed up the search
            if i > 500 and threshold >= 0.1:
                threshold -= 0.05
                i = 0
            i += 1
            sm = sent_similarity(tkn_message, nltk.word_tokenize(s[0]), weights)
            if sm >= threshold and s[1] in typs:
                relevant.append((s, sm))
                if len(relevant) >= n:
                    break
    # if no dictionary words are found, give up and just look for sentences of the right type
    else:
        for s in sents:
            if s[1] in typs:
                relevant.append((s, 0))
                if len(relevant) >= n:
                    break
    # sort by relevance
    relevant.sort(key=lambda x: x[1])
    if relevant:
        return relevant[:10]
    else:
        return []
def getScore(n):
    text = nltk.corpus.brown.sents()
    ss = []
    wss = []
    for i in xrange(n):
        # pick a random sentence of reasonable shape (3+ tokens, ends with ".")
        s = []
        while len(s) <= 2 or s[0][0] > "a" or s[-1] != ".":
            s = text[int(random.uniform(0, len(text)))]
        ss.append(s)
        # corrupt one random character of one random word (never the final ".")
        ws = list(s)
        w = []
        while len(w) <= 1 or "." in w:
            j = int(random.uniform(0, len(ws) - 1))
            w = list(ws[j])
        k = int(random.uniform(0, len(w)))
        c = chr(int(random.uniform(ord("a"), ord("z") + 1)))
        w[k] = c
        ws[j] = "".join(w)
        # print c, s[j], ws[j]
        assert len(s[j]) == len(ws[j])
        wss.append(ws)
    # run the corrector over the corrupted text, then compare sentence by sentence
    ss = unsentences(ss)
    wss = unsentences(wss)
    css = spell.correct(wss)
    ss = splitSentence(ss)
    wss = splitSentence(wss)
    css = splitSentence(css)
    dd = 0
    for (s, ws, cs) in map(None, ss, wss, css):
        print unwords(s)
        print unwords(ws)
        print unwords(cs)
        d = distance.distance(s, cs)
        print d
        print
        if d > 0:
            dd += 1
    # fraction of corrupted sentences the corrector failed to restore exactly
    return float(dd) / n
def normal_srch(query):
    start_time = time.time()
    wrong_words = index_search(query)
    split_query = str(query).split()
    corrected_word = ""
    flag = 0
    # if wrong_words is not empty, generate the corrected string
    if wrong_words:
        for x in split_query:
            if x in wrong_words:
                corrected_word = corrected_word + " " + spell.correct(x)
            else:
                corrected_word = corrected_word + " " + x
        final_result = index_search(corrected_word)
        flag = 1
    query_time = time.time() - start_time
    if wrong_words and not final_result:
        print "Did you mean... "
        for x in corrected_word.split():
            if x in split_query:
                print "%s" % x
            else:
                print "<font color=blue><i>%s</i></font>" % x
    still_words_remaining = len(wrong_words) != len(split_query)
    if not wrong_words or not final_result or still_words_remaining:
        if flag == 1:
            # case when the input string has been corrected
            evaluate(corrected_word)
        else:
            evaluate(query)
        print "<br>Your search query took <b><font color=blue>%s</font></b> seconds.<br><hr>" % query_time
        print_results()
    else:
        print "<br>Your search <b>%s</b> did not match any document" % (query)
def get_stats(self, line2, label, badwords, negword_list, posword_list):
    codecs.register_error('replace_with_space', self.handler)
    # count uppercase letters
    caps_count = sum(x.isupper() for x in line2)
    # remove garbage, lowercase & strip
    line = ''.join(filter(lambda x: ord(x) < 128, line2.lower().strip()))
    # remove double quotes
    line = line[1:-1]
    # decode to ascii
    line = line.decode('string-escape').decode('utf-8', 'replace_with_space').encode('ascii', 'ignore').decode('unicode-escape').encode('iso-8859-1', 'replace_with_space')
    # replace a leading @name
    line = re.sub(r'^@\w{2,}', r'NameOfPerson', line)
    # count words with unwanted repetitions
    rep_count = len(re.findall(r'(.)\1\1+', line))
    # remove unwanted repetitions
    line = re.sub(r'(.)\1\1+', r'\1', line)
    # replace badwords
    for badword in badwords:
        line = re.sub(r"\b" + re.escape(badword) + r"\b|[!@#$%^&*+?~`]{3,}", r'xxbdWrdxx', line)
    # replace 'u' with 'you' & 'ur' with 'you are'
    line = re.sub(r"\bu\b", r'you', line)
    line = re.sub(r"\bu\s*r\b", r'you are', line)
    # correct spelling
    tmp_line = []
    for word in re.split(r"[^\w\,\'\.\-\?\!]+", line):
        tmp_line.append(spell.correct(word))
    line = ' '.join(tmp_line)
    # count negative words
    negword_count = 0
    for negword in negword_list:
        negword_count += line.count(negword.strip())
    # count positive words
    posword_count = 0
    for posword in posword_list:
        posword_count += line.count(posword.strip())
    # --- categorize counts ---
    # categorize badword_count
    badword_count = line.count("xxbdWrdxx")
    if badword_count >= 3:
        badword_count = 3
    # categorize rep_count
    if rep_count == 2:
        rep_count = 1
    elif rep_count >= 3:
        rep_count = 2
    # categorize negword_count
    if negword_count == 2:
        negword_count = 1
    elif negword_count >= 3 and negword_count <= 7:
        negword_count = 2
    elif negword_count >= 8:
        negword_count = 3
    # categorize posword_count
    if posword_count == 2:
        posword_count = 1
    elif posword_count >= 3 and posword_count <= 6:
        posword_count = 2
    elif posword_count >= 7:
        posword_count = 3
    # categorize caps_count
    if caps_count == 2:
        caps_count = 1
    elif caps_count >= 3 and caps_count <= 5:
        caps_count = 2
    elif caps_count >= 6:
        caps_count = 3
    # return the processed line and its stats
    return [label, badword_count, rep_count, negword_count, posword_count, "\"" + line + "\"", caps_count]
# documents.setdefault("doc2", doc2)
documents = {}
for filename in os.listdir('story'):
    f = open('story//' + filename).read()
    documents.setdefault(filename.decode('utf-8'), f)

for doc_id, text in documents.iteritems():
    doc_index = inverted_index(text)
    inverted_index_add(inverted, doc_id, doc_index)

# Print the inverted index
for word, doc_locations in inverted.iteritems():
    print word, doc_locations

# Search something and print results
queries = [spell.correct('good')]

def extract_text(doc, index):
    return documents[doc].decode('utf-8')[index:index + 30].replace('\n', ' ')

for query in queries:
    result_docs = search(inverted, query)
    # %s formats via str(); %r formats the object via repr()
    print "Search for '%s': %s" % (query, u','.join(result_docs.keys()))
    if result_docs:
        result_docs_list = sorted(result_docs.items(), key=lambda x: len(x[1]), reverse=True)
        # result_docs = dict(result_docs_list)
        for doc, offsets in result_docs_list:
            for offset in offsets:
                # the original snippet was truncated here; presumably each
                # match is shown via extract_text
                print extract_text(doc, offset)
(c1, c2, q0, q2, flag, nr, r) = win.getevent()
if c2 in "xX":
    if flag & 2:
        win.getevent()
    if flag & 8:
        win.getevent()
        win.getevent()
    win.writeevent(c1, c2, q0, q2)
    if c2 == "x" and r == "Del":
        outwin.delete()
        break
if c1 == "K" and c2 == "I":
    ch = r[0]
    if ch in " \t\r\n":
        outwin.replace(",", "")
        continue
    # scan backwards to the start of the word under the insertion point
    while q0 >= 0 and not (ch in " \t\r\n"):
        sss = win.read(q0, q0 + 1)
        if not sss:
            # print("empty sss %d" % q0)
            sss = " "
        ch = sss[0]
        q0 -= 1
    if q0 < 0 and not (ch in " \t\r\n"):
        q0 = 0
    else:
        q0 += 2
    ss = win.read(q0, q2)
    lastcorrect = spell.correct(ss)
    outwin.replace(",", lastcorrect)
def stem(word):
    # word = stemmer.stem(word)
    word = correct(word)
    return word
def spellCorrector(self, text):
    tmp_line = []
    for word in re.split(r"[^\w\,\'\.\-\?\!]+", text):
        tmp_line.append(spell.correct(word))
    line = ' '.join(tmp_line)
    return line
def post(self):
    userInput = cgi.escape(self.request.get('text'))
    pageVar = {'word': userInput, 'correct': spell.correct(userInput)}
    page = JINJA_ENVIRONMENT.get_template('main.html')
    self.response.write(page.render(pageVar))
# -*- coding: utf-8 -*-
import random
from spell import correct

# 'والسسلام' is a misspelled Arabic word ("and peace", with a doubled letter)
x = correct('والسسلام')
#print unicode(x, 'utf-8')
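# A small usage demo against the single-argument sketch near the top of this
# section; the test words are the classic ones from Norvig's essay, and the
# actual output depends entirely on the corpus behind WORDS:
if __name__ == '__main__':
    for w in ['speling', 'korrectud']:
        print w, '->', correct(w)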