def correct_spelling(word_tokens, word_to_keep):
    # Correct each token in place, skipping words on the keep-list.
    for i in range(len(word_tokens)):
        w = Word(word_tokens[i])
        if w not in word_to_keep:
            word_tokens[i] = str(w.correct())
    return word_tokens
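# A quick usage sketch for correct_spelling (the token list and keep-set
# below are hypothetical examples, not from the original source):
from textblob import Word

tokens = ["speling", "textblob", "mistkes"]
keep = {"textblob"}  # domain terms that should never be "corrected"
print(correct_spelling(tokens, keep))  # e.g. ['spelling', 'textblob', 'mistakes']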
def onlyspellcorrection(query_tokens):
    """Corrects the spelling errors in the query."""
    corrected_query = []
    for t in query_tokens:
        corrected_query.append(str(Word(t).correct()))
    # cmp() no longer exists in Python 3; list equality does the same job.
    if corrected_query == query_tokens:
        return query_tokens
    print("Did you mean: " + " ".join(corrected_query) + "?")
def spell_check(query):
    # Split the query.
    splitted_query = query.split()
    # Empty list for the spell-checked query.
    corrected_query = []
    # Fetch freq_dict from the database.
    dict_collection = mongo.db["context_dict_collection"]
    freq_dict = dict_collection.find_one({"name": "freq_dict"})["freq_dict"]
    # Stop words.
    stop_words = get_stop_words("en")
    stop_words.append("can")
    # For each word in the split query:
    for word in splitted_query:
        # Convert to a TextBlob Word.
        blob_word = Word(word)
        # All the possible corrections for the word.
        possible_corrections = blob_word.spellcheck()
        # Initial counter.
        freq_counter = 1
        # For the case when the spelling is incorrect but no word in the
        # document vocabulary can correct it.
        at_least_one = False
        # In case the spelling is already correct.
        corrected_word = blob_word
        # For each possible correction of the word:
        for p in possible_corrections:
            # p[0] is the correction, p[1] its score.
            if p[0] in freq_dict:
                # At least one correction is present in the dictionary,
                # so use frequency-based correction.
                at_least_one = True
                # Frequency of p[0].
                frequency = freq_dict[p[0]]
            else:
                frequency = 0
            # Keep the highest frequency and the corresponding word on record.
            if frequency >= freq_counter and p[0] not in stop_words:
                freq_counter = frequency
                corrected_word = p[0]
        # No correction was present in the dictionary:
        if not at_least_one:
            # Fall back to the correction with the highest score.
            corrected_word = blob_word.correct()
        corrected_query.append(corrected_word)
    return " ".join(corrected_query)
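# Sketch of the idea behind spell_check above, without the MongoDB and
# stop-word dependencies. freq_dict is assumed to map corpus vocabulary to
# occurrence counts (shape inferred from the find_one() call); the sample
# words are hypothetical. Candidates that exist in the corpus win over
# TextBlob's default highest-confidence suggestion:
from textblob import Word

freq_dict = {"python": 42, "pythons": 3}  # stand-in for the DB lookup
blob_word = Word("pythn")
in_corpus = [c for c, score in blob_word.spellcheck() if c in freq_dict]
if in_corpus:
    corrected = max(in_corpus, key=freq_dict.get)
else:
    corrected = str(blob_word.correct())
print(corrected)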
def spellcorrection(query_tokens):
    """Corrects the spelling errors in the query."""
    corrected_query = []
    for t in query_tokens:
        corrected_query.append(str(Word(t).correct()))
    # cmp() and raw_input() are Python 2 only; use == and input() instead.
    if corrected_query == query_tokens:
        return query_tokens
    print("Did you mean: " + " ".join(corrected_query) + "?")
    choice = int(
        input("Press 1 to continue with the original query, otherwise Press 0\n"))
    if choice == 0:
        return corrected_query
    return query_tokens
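# Example session with spellcorrection (a sketch; input and output are
# illustrative, and the exact suggestions depend on TextBlob's spelling model):
#   >>> spellcorrection(["theyr", "books"])
#   Did you mean: they books?
#   Press 1 to continue with the original query, otherwise Press 0
#   0
#   ['they', 'books']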
print(spanish)

from textblob import Word

index = Word('index')
print(index.pluralize())
animals = TextBlob('dog cat fish sheep bird').words
print(animals.pluralize())
cacti = Word('cacti')
print(cacti.singularize())
word = Word('theyr')
# Returns the possible corrections and the confidence assigned to each.
print(word.spellcheck())
# correct() returns a new Word; it does not modify `word` in place.
print(word.correct())
sentence = TextBlob('Ths sentense has missplled wrds.')
sentence = sentence.correct()
print(sentence)
print(blob.sentiment)  # evaluates how positive vs. negative the statement is
sentences = blob.sentences
for sentence in sentences:
    print(sentence.sentiment)
print(blob.detect_language())
spanish = blob.translate(to="es")  # translate the statement to Spanish!
print(spanish)

from textblob import Word

index = Word("index")
print(index.pluralize())
animals = TextBlob('dog cat fish sheep bird').words
print(animals.pluralize())  # pluralizes each word in the blob
word = Word("theyr")
print(word.spellcheck())
print(word.correct())
# sentence = TextBlob("This sentence has misspelled wrds.")
print(index.pluralize())
cacti = Word("cacti")
print(cacti.singularize())
animals = TextBlob("dog cat fish bird").words
print(animals.pluralize())
word = Word("theyr")
print(word.spellcheck())
corrected_word = word.correct()
print(corrected_word)
sentence = TextBlob("Ths sentnce has misspeled wrds.")
corrected_sentence = sentence.correct()
print(corrected_sentence)

#############
word1 = Word("studies")
word2 = Word("varieties")
print(word1.lemmatize())
print(word2.lemmatize())
def correctSpel(str1):
    w = Word(str1)
    chk = w.spellcheck()  # ranked (candidate, confidence) pairs; unused here
    correct1 = w.correct()
    print(correct1)
print(w.lemmatize("n")) w = Word("am") print(w.lemmatize("v")) w = Word("are") print(w.lemmatize("v")) w = Word("were") print(w.lemmatize("v")) w = Word("is") print(w.lemmatize("v")) # ============================================================================= # spell check # ============================================================================= w = Word("havv") print(w.spellcheck()) w = w.correct() print(w) # plural word list t = TextBlob( "Data science is an inter-disciplinary fild that uses scientfic methods, processes, algoriths and systems to extract knwledge and insigts from many structural and unstructured data. Data science is related to data mining and big data." ) print(t.spellcheck()) # does not work ... no such methid print(t.correct()) # get individual words words = t.words for w in words: print(w.spellcheck()) print(w.correct()) print("")
string)
pattern = re.compile(r"(\w*\s*)*")
match = pattern.match(string)
print(match.group())
clean_file.write(match.group().strip())
clean_file.write("\n")
clean_file.close()

# Tokenize, spell-correct, and normalize.
origin_Dict = {}
with open("cleaned/ring_clean.txt") as clean:
    i = 0
    for line in clean:
        words = []
        for word in line.split():
            w = Word(word)
            words.append(w.correct().singularize())
        if len(words) < 3:
            continue
        origin_Dict[i] = words
        i = i + 1
print(i)

# POS tagging: filter for nouns and verbs only.
filter_Dict = {}
tag_Dict = {}
for index, words in origin_Dict.items():
    tag_Dict[index] = nltk.pos_tag(words)
for index, tags in tag_Dict.items():
    tem = []
    for tag in tags:
        # print(tag)
        if tag[1] != 'NN' and tag[1] != 'VB':
sentence = TextBlob('Use 4 spaces per indentation level.')
print(sentence.words[2].singularize())  # similarly, you can use pluralize()
print()

# Word lemmatization.
w = Word("octopi")
print("octopi -> ", w.lemmatize())
w = Word("went")
print("went -> ", w.lemmatize("v"))
print()

# Definition.
print("Octopus : ", Word("octopus").definitions)
print()

# Translation and language detection.
en_blob = TextBlob(u'Simple is better than complex.')
print('Simple is better than complex.')
print("SPANISH : ", en_blob.translate(to='es'))
en_blob = TextBlob(u'Comment allez vous?')
print('Comment allez vous?')
print("language : ", en_blob.detect_language())
print()

# Spell-check.
w = Word("banama")
print("banama")
print("correction : ", w.correct())
print("suggestions : ", w.spellcheck())
print()
def classify_search_terms(paths, out_path, dict_path, log_progress=print):
    sc = SpellChecker()  # unused below; TextBlob's Word does the correcting
    # Has to match the order of file types in paths.
    ptypes = ['organic', 'paid', 'none']
    extra_rows = [
        'Portfolio Classification', 'Clicks', 'Impressions', 'Average position'
    ]
    out_wb = Workbook(write_only=True)
    out_ws = out_wb.create_sheet()
    cls_dict = load_dict(dict_path)
    open_paths = []
    # First pass: open files, generate the simple portfolio classification.
    pc = {}
    for ip, p in enumerate(paths):
        wb = load_workbook(p, read_only=True)
        ws = wb.active
        open_paths.append(ws)
        for i, row in enumerate(ws.rows, 1):
            # Skip the first line and the headers.
            if i < 3:
                continue
            st = row[0].value
            st = ' '.join(str(st).lower().split(' '))
            if not is_english(st):
                print('Not english:', repr(st))
                continue
            if st not in pc:
                pc[st] = []
            # Add this classification only if it's not already there.
            if ptypes[ip] not in pc[st]:
                pc[st].append(ptypes[ip])
    none_matches = [0, 0, 0]
    partial_matches = [0, 0, 0]
    full_matches = [0, 0, 0]
    # The number of search terms in each portfolio.
    pt_totals = [0, 0, 0]
    from_partial_to_full = 0
    from_unclass_to_partial = 0
    from_unclass_to_full = 0
    # Click stats: paid and organic.
    clicks = [0, 0]
    for ip, ws in enumerate(open_paths):
        i = 1
        log_progress('Starting processing rows...')
        for row in ws.rows:
            # Deal with the first line in the input file and the header,
            # but only when processing the first file.
            if ip == 0 and i < 3:
                r = row
                # Add the header only at the beginning.
                if i == 2:
                    r = [row[0].value, 'Semantic Classification'] + extra_rows
                else:
                    # Rewrite the first line from the input.
                    r = [_r.value for _r in r]
                out_ws.append(r)
                i += 1
                continue
            elif i < 3:
                # Skip first lines and headers for the rest of the files.
                i += 1
                continue
            # Process the rest of the data: get the search term.
            st = row[0].value
            print('Search term:', st)
            if st is None:
                continue
            st = str(st).lower().split(' ')
            sst = ' '.join(st)
            if sst not in pc:
                continue
            csst = pc[sst]
            if csst == 'added':
                print('Search term has been already added')
                continue
            # Try to classify.
            sclsi, tom, um_kws = classify_kws(st, cls_dict)
            # Try spell correction and classify again.
            sst = ' '.join(st)
            corrected_st = sst
            current_tom = tom
            # Try to improve partial and unclassified matches.
            if tom in ('partial', 'unclassified'):
                print('Trying to improve ', tom)
                # Go through the keywords that didn't have any match.
                for ukw in um_kws:
                    w = Word(ukw)
                    wc = w.correct()
                    # If the spell-corrected word is new, use it.
                    if wc != ukw:
                        corrected_st = corrected_st.replace(ukw, wc)
                        continue
                    wl = w.lemmatize()
                    if wl != ukw:
                        corrected_st = corrected_st.replace(ukw, wl)
                        continue
                # If we've changed the search term, classify it again.
                if corrected_st != sst:
                    # print('Corrected st from %s->%s' % (sst, corrected_st))
                    corrected_st = corrected_st.split()
                    sclsi, tom, um_kws = classify_kws(corrected_st, cls_dict)
                    if current_tom == 'unclassified' and tom == 'partial':
                        print('Corrected from %s to %s!' % (current_tom, tom))
                        st = corrected_st
                        from_unclass_to_partial += 1
                    elif current_tom in ('unclassified', 'partial') and tom == 'full':
                        print('Corrected from %s to full!' % current_tom)
                        st = corrected_st
                        if current_tom == 'partial':
                            from_partial_to_full += 1
                        if current_tom == 'unclassified':
                            from_unclass_to_full += 1
            if tom == 'full':
                full_matches[ip] += 1
            elif tom == 'partial':
                partial_matches[ip] += 1
            elif tom == 'unclassified':
                none_matches[ip] += 1
            pt_totals[ip] += 1
            pclsi = ' | '.join(pc[sst])
            # Mark this search term as added to prevent duplicates.
            pc[sst] = 'added'
            # Generate the output.
            newrow = [row[0].value, sclsi, pclsi]
            if ptypes[ip] == 'organic' and pclsi.find('organic') != -1:
                # Fill in extra rows only from the organic file.
                newrow += [_r.value for _r in row[1:]]
                no_clicks = int(row[1].value)
                # Count clicks.
                if pclsi.find('paid') != -1:
                    clicks[0] += no_clicks
                if pclsi.find('organic') != -1:
                    clicks[1] += no_clicks
            out_ws.append(newrow)
            if i % 1000 == 0:
                n = datetime.now()
                m = '%s:generated %d rows.' % (n, i)
                log_progress(m)
                print(m)
            i += 1
            # print(st, clsi)
    max_rows = i
    print('Total correction from partial to full: %d' % from_partial_to_full)
    print('Total correction from unclass to partial: %d' % from_unclass_to_partial)
    print('Total correction from unclass to full: %d' % from_unclass_to_full)
    print('Number of clicks', clicks)
    snames = ['full', 'partial', 'unclassified']
    stats = [full_matches, partial_matches, none_matches]
    out_ws.append(['Total:'])
    # Total stats for each portfolio.
    out_ws.append(['Match type/Portfolio'] + ['+'.join(ptypes)])
    for i, s in enumerate(stats):
        spp = '%.2f%%' % ((sum(s) * 100) / sum(pt_totals))
        out_ws.append([snames[i]] + [spp])
    out_ws.append([])
    # Per-portfolio stats.
    out_ws.append(['Per Portfolio:'])
    out_ws.append(['Match type/Portfolio'] + ptypes)
    for i, s in enumerate(stats):
        pp = []
        # A separate inner index: the original reused `i`, which clobbered the
        # outer loop variable and mislabeled every row as 'unclassified'.
        for j, m in enumerate(s):
            if pt_totals[j] == 0:
                pp.append('0%')
            else:
                pp.append('%.2f%%' % ((m * 100) / pt_totals[j]))
        out_ws.append([snames[i]] + pp)
    out_ws.append(['Classification improvements:'])
    out_ws.append(['Unclassified->Partial', from_unclass_to_partial])
    out_ws.append(['Partial->Full', from_partial_to_full])
    out_ws.append(['Unclassified->Full', from_unclass_to_full])
    # Generate venn diagram
    # import tempfile
    # vd = venn2((clicks[0], abs(clicks[0] - clicks[1]), clicks[1]),
    #            set_labels=('Paid', 'Organic'))
    # vdf = tempfile.NamedTemporaryFile()
    # vdf = vdf.name + '.png'
    # plt.savefig(vdf)
    # img = drawing.image.Image(vdf)
    # max_rows = out_ws._max_row + 1
    # imgcell = 'A%d' % max_rows
    # out_ws.add_image(img, imgcell)
    log_progress('Saving output file...')
    out_wb.save(out_path)
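# The spell-correct-and-retry step above, distilled into a standalone helper
# (a sketch: `classify` stands in for classify_kws and is assumed to return
# (classification, type_of_match, unmatched_keywords)):
from textblob import Word

def retry_with_corrections(term, unmatched_kws, classify, cls_dict):
    fixed = term
    for kw in unmatched_kws:
        w = Word(kw)
        replacement = str(w.correct())
        if replacement == kw:                  # correction changed nothing,
            replacement = str(w.lemmatize())   # so try the lemma instead
        if replacement != kw:
            fixed = fixed.replace(kw, replacement)
    if fixed != term:
        return classify(fixed.split(), cls_dict)  # reclassify the rewrite
    return None  # nothing changed; keep the original result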
def classify_search_terms2():
    # sc = SpellChecker()
    # Has to match the order of file types in paths.
    ptypes = ['organic', 'paid', 'none']
    extra_rows = [
        'Portfolio Classification', 'Clicks', 'Impressions', 'Average position'
    ]
    out_wb = Workbook(write_only=True)
    out_ws = out_wb.create_sheet()
    cls_dict = load_dict2()
    open_paths = []
    # First pass: generate the simple portfolio classification.
    pc = {}
    for term in SearchTerm.objects.all():
        st = ' '.join(str(term.query).lower().split(' '))
        if not is_english(st):
            print('Not english:', repr(st))
            continue
        if st not in pc:
            pc[st] = []
        # Add this classification only if it's not already there.
        if term.search_portfolio.name not in pc[st]:
            pc[st].append(term.search_portfolio.name)
    organic_portfolio = SearchPortfolio.objects.get(name='organic')
    paid_portfolio = SearchPortfolio.objects.get(name='paid')
    none_matches = {}  # e.g. {'organic': 0, 'paid': 0, 'none': 3}
    partial_matches = {}
    full_matches = {}
    pt_totals = {}  # e.g. {'organic': 0, 'paid': 0, 'none': 3}
    for portfolio in SearchPortfolio.objects.all():
        none_matches[portfolio.name] = 0
        partial_matches[portfolio.name] = 0
        full_matches[portfolio.name] = 0
        pt_totals[portfolio.name] = portfolio.searchterm_set.count()
    from_partial_to_full = 0
    from_unclass_to_partial = 0
    from_unclass_to_full = 0
    # Click stats: paid and organic.
    clicks = [0, 0]
    i = 1
    classifications_list = []
    SearchClassification.objects.all().delete()
    for term in SearchTerm.objects.all():
        # Process the data: get the search term.
        st = term.query
        print('%s - Search term: %s' % (term.id, st))
        if st is None:
            continue
        st = str(st).lower().split(' ')
        sst = ' '.join(st)
        if sst not in pc:
            continue
        csst = pc[sst]
        if csst == 'added':
            print('Search term has been already added')
            continue
        # Try to classify.
        sclsi, tom, um_kws = classify_kws(st, cls_dict)
        # Try spell correction and classify again.
        sst = ' '.join(st)
        corrected_st = sst
        current_tom = tom
        # Try to improve partial and unclassified matches.
        if tom in ('partial', 'unclassified'):
            print('Trying to improve ', tom)
            # Go through the keywords that didn't have any match.
            for ukw in um_kws:
                w = Word(ukw)
                wc = w.correct()
                # If the spell-corrected word is new, use it.
                if wc != ukw:
                    corrected_st = corrected_st.replace(ukw, wc)
                    continue
                wl = w.lemmatize()
                if wl != ukw:
                    corrected_st = corrected_st.replace(ukw, wl)
                    continue
            # If we've changed the search term, classify it again.
            if corrected_st != sst:
                # print('Corrected st from %s->%s' % (sst, corrected_st))
                corrected_st = corrected_st.split()
                sclsi, tom, um_kws = classify_kws(corrected_st, cls_dict)
                if current_tom == 'unclassified' and tom == 'partial':
                    print('Corrected from %s to %s!' % (current_tom, tom))
                    st = corrected_st
                    from_unclass_to_partial += 1
                elif current_tom in ('unclassified', 'partial') and tom == 'full':
                    print('Corrected from %s to full!' % current_tom)
                    st = corrected_st
                    if current_tom == 'partial':
                        from_partial_to_full += 1
                    if current_tom == 'unclassified':
                        from_unclass_to_full += 1
        if tom == 'full':
            full_matches[term.search_portfolio.name] += 1
        elif tom == 'partial':
            partial_matches[term.search_portfolio.name] += 1
        elif tom == 'unclassified':
            none_matches[term.search_portfolio.name] += 1
        pclsi = ' | '.join(pc[sst])
        # Mark this search term as added to prevent duplicates.
        pc[sst] = 'added'
        # Generate the output.
        newrow = [term.query, sclsi, pclsi]
        if term.search_portfolio.name == 'organic' and pclsi.find('organic') != -1:
            # Fill in extra rows only from the organic data.
            newrow += [
                term.clicks, term.impressions, term.avg_position, term.cost,
                term.conversions, term.total_conv
            ]
            no_clicks = int(term.clicks)
            # Count clicks.
            if pclsi.find('paid') != -1:
                clicks[0] += no_clicks
            if pclsi.find('organic') != -1:
                clicks[1] += no_clicks
        out_ws.append(newrow)
        if i % 1000 == 0:
            n = datetime.now()
            m = '%s:generated %d rows.' % (n, i)
            print(m)
        i += 1
        search_classification = SearchClassification(
            query=term.query, search_dictionary_ids=sclsi, portfolio_ids=pclsi)
        if pclsi.find('organic') != -1:
            search_term = SearchTerm.objects.filter(
                query=term.query,
                search_portfolio_id=organic_portfolio.id).first()
            search_classification.organic_clicks = search_term.clicks
            search_classification.organic_impressions = search_term.impressions
            search_classification.organic_avg_position = search_term.avg_position
        if pclsi.find('paid') != -1:
            search_term = SearchTerm.objects.filter(
                query=term.query,
                search_portfolio_id=paid_portfolio.id).first()
            search_classification.paid_clicks = search_term.clicks
            search_classification.paid_impressions = search_term.impressions
            search_classification.paid_cost = search_term.cost
            search_classification.paid_conversions = search_term.conversions
            search_classification.paid_total_conv = search_term.total_conv
        print("query - %s" % search_classification.query)
        print("semantic - %s" % search_classification.search_dictionary_ids)
        search_classification.save()
        # classifications_list.append(search_classification)
    # SearchClassification.objects.bulk_create(classifications_list)
    print("SearchClassification records are created.")
    print('*****************')
    print(none_matches)
    print(partial_matches)
    print(full_matches)
    print(pt_totals)
    max_rows = i
    print("max_rows: %s" % max_rows)
    max_rows = SearchTerm.objects.count()
    print("max_rows: %s" % max_rows)
    print('Total correction from partial to full: %d' % from_partial_to_full)
    print('Total correction from unclass to partial: %d' % from_unclass_to_partial)
    print('Total correction from unclass to full: %d' % from_unclass_to_full)
    print('Number of clicks', clicks)
    snames = ['full', 'partial', 'unclassified']
    stats = [full_matches, partial_matches, none_matches]
    out_ws.append(['Total:'])
    # Total stats for each portfolio.
    out_ws.append(['Match type/Portfolio'] + ['+'.join(ptypes)])
    total_match = {}
    match = []
    for i, s in enumerate(stats):
        spp = '%.2f%%' % ((sum(s.values()) * 100) / sum(pt_totals.values()))
        out_ws.append([snames[i]] + [spp])
        total_match[snames[i]] = (sum(s.values()) * 100) / sum(pt_totals.values())
    out_ws.append([])
    # Per-portfolio stats.
    out_ws.append(['Per Portfolio:'])
    out_ws.append(['Match type/Portfolio'] + ptypes)
    for i, s in enumerate(stats):
        pp = []
        match_t = []
        for item in s.items():
            portfolio = SearchPortfolio.objects.get(name=item[0])
            if pt_totals[item[0]] == 0:
                pp.append('0%')
                match_t.append((portfolio.id, 0))
            else:
                pp.append('%.2f%%' % ((item[1] * 100) / pt_totals[item[0]]))
                match_t.append((portfolio.id,
                                round((item[1] * 100) / pt_totals[item[0]], 2)))
        out_ws.append([snames[i]] + pp)
        match.append(match_t)
    out_ws.append(['Classification improvements:'])
    out_ws.append(['Unclassified->Partial', from_unclass_to_partial])
    out_ws.append(['Partial->Full', from_partial_to_full])
    out_ws.append(['Unclassified->Full', from_unclass_to_full])
    total_match['unclassified_partial'] = from_unclass_to_partial
    total_match['partial_full'] = from_partial_to_full
    total_match['unclassified_full'] = from_unclass_to_full
    SearchMatchTotal.objects.all().delete()
    total = SearchMatchTotal(**total_match)
    total.save()
    match_list = []
    for i in range(0, len(match[0])):
        item = [m[i] for m in match]  # one (portfolio_id, pct) per match type
        print(item)
        match_list.append(
            SearchMatch(full=item[0][1],
                        partial=item[1][1],
                        unclassified=item[2][1],
                        search_portfolio_id=item[0][0]))
    SearchMatch.objects.all().delete()
    SearchMatch.objects.bulk_create(match_list)
    # Generate venn diagram
    # out_path = '/home/engineer/Projects/Python/Shilo/classified.xlsx'
    # out_wb.save(out_path)
    # print(line)
    # Strip punctuation and digits (str.translate(None, ...) was Python 2).
    line = line.translate(str.maketrans('', '', '",.()!;/?0123456789'))
    # tem = TextBlob(line)
    # s = tem.correct()
    words = []
    for word in line.split():
        words.append(str(word).strip())
    wordDict[i] = words
    i = i + 1

# Spell correction. Uses the TextBlob library; this takes a while.
correct_Dict = {}
for index, words in wordDict.items():
    corrected = []
    for word in words:
        w = Word(word)
        corrected.append(w.correct())
    correct_Dict[index] = corrected

# Singularization. Also uses TextBlob; also slow.
singular_Dict = {}
for index, words in correct_Dict.items():
    singular = []
    for word in words:
        singular.append(str(word.singularize()))
    singular_Dict[index] = singular

# Stemming. After stemming, some words are no longer recognizable, so this
# block is commented out. It used NLTK.
'''
stemed_Dict = {}  # dictionary of stemmed words
from nltk.stem import PorterStemmer
pst = PorterStemmer()
for index, words in singular_Dict.items():
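# The correct-then-singularize passes above can be fused into a single
# comprehension (a sketch with the same behavior, one pass over wordDict):
from textblob import Word

normalized_Dict = {
    index: [str(Word(word).correct().singularize()) for word in words]
    for index, words in wordDict.items()
}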
# Section 12.2.9 snippets (interactive-session style: bare expressions
# display their values in IPython)
from textblob import Word

word = Word('theyr')
word.spellcheck()
word.correct()  # chooses the word with the highest confidence value

from textblob import TextBlob

sentence = TextBlob('Ths sentense has missplled wrds.')
sentence.correct()

##########################################################################
# (C) Copyright 2019 by Deitel & Associates, Inc. and                    #
# Pearson Education, Inc. All Rights Reserved.                           #
#                                                                        #
# DISCLAIMER: The authors and publisher of this book have used their    #
# best efforts in preparing the book. These efforts include the         #
# development, research, and testing of the theories and programs       #
# to determine their effectiveness. The authors and publisher make      #
# no warranty of any kind, expressed or implied, with regard to these   #
# programs or to the documentation contained in these books. The authors#
# and publisher shall not be liable in any event for incidental or      #
# consequential damages in connection with, or arising out of, the      #
# furnishing, performance, or use of these programs.                    #
##########################################################################
# for sentence in sent_list:  # shows scores for each sentence
#     print(sentence.sentiment)
# Took off round, sentiment, and polarity because we're using a different
# analyzer that doesn't require them.
# ### shows the first sentence is positive and the second is negative
# ##########################################################
# print(blob.detect_language())  # finds out which language this is in
# spanish = blob.translate(to='es')
# print(spanish)
# german = blob.translate(to='de')  # translates the "blob" variable to German
# print(german)
###########################################################
from textblob import Word

my_word = Word('theyr')
print(my_word.spellcheck())
# [('they', 0.5713042216741622), ('their', 0.42869577832583783)]
# The confidence is higher that the word is supposed to be 'they'.
new_word = my_word.correct()  # picks 'they' automatically: higher confidence
print(new_word)
my_sentence = TextBlob('Ths sentense has missplld wrds.')
new_sentence = my_sentence.correct()
print(new_sentence)
from textblob import TextBlob, Word
from textblob.en import Spelling

MOCKDATA = "Hi, I cant spel at al, snd hlp."
test = TextBlob(MOCKDATA)

# spelling = Spelling(path=path)
# spelling.train()

for data in test.words:
    temp = Word(data)
    tempTuples = temp.spellcheck()
    print(temp)
    print(temp.spellcheck())
    print(temp.correct())
    for tuples in tempTuples:
        print(tuples[0])
        print(tuples[1])
        # print(tuples.confidence)
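# The (word, confidence) tuples invite a threshold: only accept a correction
# when the top candidate is reasonably certain. A sketch (the 0.8 cutoff is
# an arbitrary assumption, not from the original code):
def cautious_correct(token, min_confidence=0.8):
    best, score = max(Word(token).spellcheck(), key=lambda c: c[1])
    return str(best) if score >= min_confidence else token

for data in TextBlob(MOCKDATA).words:
    print(data, '->', cautious_correct(data))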
def spellcheck(word):
    word_object = Word(word)
    return word_object.correct()
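# Usage (illustrative input; the exact correction depends on TextBlob's model):
print(spellcheck("fianlly"))  # e.g. 'finally'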
for n in para.sentences[1].noun_phrases:
    print(n)
for t in para.sentences[1].tags:
    print(t)
print(para.sentences[2].words[1].singularize())

# Word operations
from textblob import Word

print(Word('horsases').singularize())
w = Word('better')
print(w.singularize())
print(w.pluralize())
print(w.correct())
print(w.lemmatize('a'))  # lemmatize as an adjective: 'better' -> 'good'
w = Word('horss')
print(w.spellcheck())

# n-grams
blob = TextBlob('I live in Modinagar')
print(blob.ngrams(2))
print(blob.ngrams(3))

# Sentiments
blob = TextBlob('you are wise man')