def get_close_matches(word, possibilities, n=3, cutoff=None, case_sensitive=False):
    """
    Given a string and a list of strings to look up, returns a list of pairs
    of the n closest strings (according to the edit distance) and their
    respective distances from 'word'. If cutoff is given, the returned list
    contains only strings that are closer than the cutoff.
    """
    try:
        from nltk.metrics import edit_distance
    except ImportError:
        from Levenshtein import distance as edit_distance
    hits = []
    for possibility in possibilities:
        if case_sensitive:
            d = edit_distance(word, possibility)
        else:
            d = edit_distance(word.lower(), possibility.lower())
        if cutoff is None or d < cutoff:
            hits.append((possibility, d))
    # sort by distance and keep the n closest
    hits.sort(key=lambda hit: hit[1])
    return hits[:n]
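# Usage sketch for get_close_matches above (assumes nltk or
# python-Levenshtein is installed; the candidate list is illustrative):
candidates = ['apple', 'ample', 'apply', 'maple']
print(get_close_matches('appel', candidates, n=2, cutoff=3))
# -> [('apple', 2), ('apply', 2)]: only distances below the cutoff survive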
def linguistic_distance(self):
    """
    Compare the two languages word by word for all meanings in the
    meaning_list.

    :return: the linguistic distance between the two languages
    """
    eds = []
    norms = []
    for meaning in self.meaning_list:
        word1 = self.language1_dict.get(meaning)
        word2 = self.language2_dict.get(meaning)
        if word1 is not None and word2 is not None:
            word1 = word1.words[0]
            word2 = word2.words[0]
            # length-normalized Levenshtein distance (LDN)
            LDN = edit_distance(word1, word2) / float(max(len(word1), len(word2)))
            eds += [LDN]
    for meaning in self.meaning_list:
        for meaning2 in self.meaning_list:
            if meaning != meaning2:
                word1 = self.language1_dict.get(meaning)
                word2 = self.language2_dict.get(meaning2)
                if word1 is not None and word2 is not None:
                    word1 = word1.words[0]
                    word2 = word2.words[0]
                    LDN = edit_distance(word1, word2) / float(max(len(word1), len(word2)))
                    norms += [LDN]
    average = sum(norms) / float(len(norms))
    LDND = [ed / average for ed in eds]
    return sum(LDND) / len(LDND)
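# Background sketch (not part of the original source): LDN is the
# length-normalized Levenshtein distance, LDN(w1, w2) =
# edit_distance(w1, w2) / max(len(w1), len(w2)), and LDND divides each
# same-meaning LDN by the average cross-meaning LDN so that chance
# similarity between the two sound inventories cancels out.
def ldn(w1, w2):
    return edit_distance(w1, w2) / float(max(len(w1), len(w2)))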
def cer_from_transcripts(transcripts, ys, log_path=None, truncate=True):
    '''
    Args:
        transcripts: list of predicted strings
        ys: list of reference strings
    Return:
        norm_dists: list of CER values
        dists: list of edit distances
    '''
    norm_dists = []
    dists = []
    for i, t in enumerate(transcripts):
        curr_t = t
        curr_y = ys[i]
        if len(curr_y) == 0:
            print('reference %d is empty' % i)
        # also compare with all spaces stripped
        curr_t_nos = curr_t.replace(' ', '')
        curr_y_nos = curr_y.replace(' ', '')
        if truncate:
            curr_t = curr_t[:len(curr_y)]
            curr_t_nos = curr_t_nos[:len(curr_y_nos)]
        dist = edit_distance(curr_t, curr_y)
        norm_dist = dist / len(curr_y)
        dist_nos = edit_distance(curr_t_nos, curr_y_nos)
        norm_dist_nos = dist_nos / len(curr_y_nos)
        best_dist = min(dist, dist_nos)
        best_norm = min(norm_dist, norm_dist_nos)
        if log_path is not None:
            with open(log_path, 'a') as ouf:
                ouf.write('dist: %.2f, norm_dist: %.2f\n' % (best_dist, best_norm))
        norm_dists.append(best_norm)
        dists.append(best_dist)
    return norm_dists, dists
def cer_from_transcripts(transcripts, ys, log_path, truncate=True, spaces='best'):
    '''
    Return:
        norm_dists: list of CER values
        dists: list of edit distances
    spaces: 'no', 'yes', or 'best' (to account for incongruity in raw data
        spacing)
    '''
    norm_dists = []
    dists = []
    for i, t in enumerate(transcripts):
        curr_t = t
        curr_y = ys[i]
        if len(curr_y) == 0:
            print('reference %d is empty' % i)
        curr_t_nos = curr_t.replace(' ', '')
        curr_y_nos = curr_y.replace(' ', '')
        if truncate:
            curr_t = curr_t[:len(curr_y)]
            curr_t_nos = curr_t_nos[:len(curr_y_nos)]
        dist = edit_distance(curr_t, curr_y)
        norm_dist = dist / len(curr_y)
        dist_nos = edit_distance(curr_t_nos, curr_y_nos)
        norm_dist_nos = dist_nos / len(curr_y_nos)
        # 'yes' compares with spaces, 'no' without, 'best' takes the better
        # of the two (assumed semantics for the documented modes)
        if spaces == 'yes':
            best_dist, best_norm = dist, norm_dist
        elif spaces == 'no':
            best_dist, best_norm = dist_nos, norm_dist_nos
        else:
            best_dist = min(dist, dist_nos)
            best_norm = min(norm_dist, norm_dist_nos)
        with open(log_path, 'a') as ouf:
            ouf.write('dist: %.2f, norm_dist: %.2f\n' % (best_dist, best_norm))
        norm_dists.append(best_norm)
        dists.append(best_dist)
    return norm_dists, dists
def get_related_evidence(title):
    print '>>>>>>>>>>>>>>>>>>>>>>>>>>'
    try:
        print 'given title: ' + title  # TODO: fix this...
    except UnicodeEncodeError:
        print 'title cannot be printed - contains a unicode encode error'
        return [], {}, 0
    fetch = metapub.PubMedFetcher()
    pmids = fetch.pmids_for_query(title)
    if len(pmids) == 1:
        article = fetch.article_by_pmid(pmids[0])
        if edit_distance(article.title, title) <= len(title) * 0.1:
            print 'matched title: ' + article.title.encode('utf-8')
            related_pmids = fetch.related_pmids(pmids[0])
            return _merge_related_pmids(pmids[0], related_pmids, fetch)
    elif len(pmids) > 1:
        for i in range(min(20, len(pmids))):
            article = fetch.article_by_pmid(pmids[i])
            if edit_distance(article.title, title) <= len(title) * 0.1:
                print 'matched title: ' + article.title.encode('utf-8')
                related_pmids = fetch.related_pmids(pmids[i])
                return _merge_related_pmids(pmids[i], related_pmids, fetch)
    print 'no match found'
    return [], {}, 0
def find_eng_neighbour(inpword, c, data, lsh, soundex_eng):
    # look up LSH candidates for the word's soundex code
    minhash = MinHash(num_perm=32)
    word = soundex_eng.soundex(inpword)
    test_i = word
    for d in word:
        minhash.update(d.encode("utf-8"))
    results = lsh.query(minhash)
    # keep the candidate soundex code with the smallest edit distance
    min_res = 999
    indx = -1
    for i in results:
        res = edit_distance(data[i][0], test_i)
        if min_res > res:
            indx = i
            min_res = res
            if min_res == 0:
                break
    if indx == -1:
        return "nomatch"
    word = data[indx][0]
    # count = c.execute("select max(count) from dev_table where soundex = '" + word + "'")
    # a = list(count)
    dev_word = c.execute("select eng from eng_table where soundex = '" + word + "'")
    dev_word = list(dev_word)
    # among words sharing that soundex code, return the closest to the input
    min_res = 999
    final_word = inpword
    for row in dev_word:
        candidate = row[0]
        res = edit_distance(candidate, inpword)
        if min_res > res:
            final_word = candidate
            min_res = res
    return final_word
def checkSentenceTri(sentence, size, ref, ref_tot):
    trueWord = ref
    totalNumberOfWords = ref_tot
    words = sentence.split(' ')
    for a in range(0, size - 2):
        bestSuite = {}
        checkWord = words[a] + ' ' + words[a + 1] + ' ' + words[a + 2]
        bestSuite[checkWord] = []
        for i in trueWord.keys():
            word = i
            if checkWord in trueWord:
                bestSuite.pop(checkWord)
                break
            # compute the distance once instead of twice
            dist = edit_distance(word, checkWord)
            if 2 < dist <= 5 and checkWord not in trueWord:
                bestSuite[checkWord].append({word: int(trueWord[i])})
        for key in bestSuite:
            if key == '':
                break
            for dic_in_list in bestSuite[key]:
                for kay in dic_in_list:
                    print(str(key) + " ---> " + str(kay) + " " +
                          str(dic_in_list[kay] / totalNumberOfWords))
    return
def get_abstract_by_title(title):
    print '>>>>>>>>>>>>>>>>>>>>>>>>>>'
    print 'searching entry with title: ' + title
    fetch = metapub.PubMedFetcher()
    pmids = fetch.pmids_for_query(title)
    if len(pmids) == 0:
        print 'warning: no entry retrieved for given title'
        return None, ''
    elif len(pmids) == 1:
        article = fetch.article_by_pmid(pmids[0])
        if edit_distance(article.title, title) <= math.ceil(len(title) * 0.1) and article.abstract is not None:
            print 'successfully matched title: ' + article.title
            return article.title, article.abstract
        else:
            print 'warning: found one entry but not a match'
            return None, ''
    else:
        print 'warning: retrieved more than one entry for given title'
        for i in range(min(20, len(pmids))):
            article = fetch.article_by_pmid(pmids[i])
            if edit_distance(article.title, title) <= math.ceil(len(title) * 0.1) and article.abstract is not None:
                print 'successfully matched title: ' + article.title
                return article.title, article.abstract
        print 'warning: no entry is a match'
        return None, ''
def matches_author(self, string, fuzzy=False, distance_threshold=3):
    """
    Retrieves from the KnowledgeBase possible authors that match the search
    string. None is returned if no matches are found.

    :param string: the string to be matched
    :param fuzzy: whether exact or fuzzy string matching should be applied
    :param distance_threshold: the maximum edit distance threshold
        (ignored if `fuzzy==False`)
    :return: a list of tuples, ordered by distance between the search and
        the matching string, where:
        tuple[0] contains the id (i.e. CTS URN) of the matching author,
        tuple[1] contains a label of the matching author,
        tuple[2] is the distance, measured in characters, between the
        search string and the matching string;
        or None if no match is found.
    """
    # string = string.lower()
    author_matches, abbr_matches = [], []
    if not fuzzy:
        author_matches = [(id.split("$$")[0],
                           self._author_names[id],
                           len(self._author_names[id]) - len(string))
                          for id in self._author_idx.searchAllWords(string)]
        abbr_matches = [(id.split("$$")[0],
                         self._author_abbreviations[id],
                         len(self._author_abbreviations[id]) - len(string))
                        for id in self._author_abbr_idx.searchAllWords(string)]
    else:
        abbr_matches = [(id.split("$$")[0],
                         self._author_abbreviations[id],
                         edit_distance(string, self._author_abbreviations[id]))
                        for id in self._author_abbreviations
                        if edit_distance(string, self._author_abbreviations[id]) <= distance_threshold]
        abbr_matches = sorted(abbr_matches, key=itemgetter(2))
        author_matches = []
        for id in self._author_names:
            if string.endswith("."):
                if string.replace(".", "") in self._author_names[id]:
                    if len(string) > (len(self._author_names[id]) / 2):
                        try:
                            assert abbr_matches[0][2] == 0
                            distance = len(self._author_names[id]) - len(string)
                            if distance < 0:
                                distance = 1
                            author_matches.append((id.split("$$")[0],
                                                   self._author_names[id],
                                                   distance))
                        except Exception, e:
                            author_matches.append((id.split("$$")[0],
                                                   self._author_names[id], 0))
                else:
                    if edit_distance(string, self._author_names[id]) <= distance_threshold:
                        author_matches.append((id.split("$$")[0],
                                               self._author_names[id],
                                               edit_distance(string, self._author_names[id])))
            else:
                if edit_distance(string, self._author_names[id]) <= distance_threshold:
                    author_matches.append((id.split("$$")[0],
                                           self._author_names[id],
                                           edit_distance(string, self._author_names[id])))
    # assumed completion (the snippet is truncated here): order all matches
    # by distance, as the docstring describes
    return sorted(author_matches + abbr_matches, key=itemgetter(2))
def spell_correction(document, vocab):
    # spell correction with suggestions
    with open(vocab, 'rb') as f:
        vocab = pickle.load(f)
    with open(document, 'rb') as f:
        rawtext = pickle.load(f)
    tokens = nltk.WordPunctTokenizer().tokenize(rawtext)
    tokens = [x for x in tokens if x]
    wrongwords = [word for word in tokens if word not in vocab]
    error_location = [i for i, token in enumerate(tokens) if token in wrongwords]
    suggestions = {}
    mindistances_sugg = {}
    bestmatch = {}
    for word in wrongwords:
        mindistances = []
        mindistances_word = []
        for v in vocab:
            if edit_distance(word, v) <= 4:
                mindistances.append(v)
                mindistances_word.append(edit_distance(word, v))
        suggestions[word] = mindistances
        if mindistances_word:
            mindistances_sugg[word] = min(mindistances_word)
    for word in wrongwords:
        # keep only the suggestions at the minimum distance
        dist = mindistances_sugg.get(word)
        bestmatch[word] = [x for x in suggestions[word]
                           if edit_distance(word, x) == dist]
    big_table = list(zip(wrongwords, error_location, bestmatch.values()))
    df = pd.DataFrame(big_table)
    df.columns = ["wrongspellings", "location", "correction"]
    for word in wrongwords:
        print(word + "....?", "did you mean .........", suggestions[word])
        print("best match ......", bestmatch[word],
              "please do humanity a favor and go to school boy .....................")
    return df
def match():
    second = session.attributes['lyric line']
    next_line, artist_song = read_songs()
    query = second.lower()
    # find the stored lyric line closest to the query
    q = min(artist_song.keys(), key=lambda x: edit_distance(x, query))
    dist = edit_distance(q, query)
    print(dist)
    if dist > 10:
        pass  # TODO: handle poor matches instead of answering anyway
    return question(str(artist_song[q][0]) + ': ' + str(artist_song[q][1]))
def fixstring(string):
    # no string, return
    if len(string) == 0:
        return string
    # split words in string
    stringlist = string.split()
    # add stuff to new string
    newstring = ''
    old_st = ''
    for st in stringlist:
        # drop repeats
        if st == old_st:
            continue
        old_st = st
        # determine if real word
        if wordnet.synsets(st):
            newstring += st + ' '
            continue
        # determine if almost a real word
        try:
            fixword = spell(st)
            if wordnet.synsets(fixword):
                # word is now real; only keep words that are 3 or fewer
                # edits away from a real word
                if edit_distance(st, fixword) < 4:
                    newstring += fixword + ' '
                    continue
        except Exception:
            pass
        # determine if number
        try:
            float(st)
            # only keep smaller numbers
            if len(st) < 4:
                newstring += st + ' '
                continue
        except ValueError:
            pass
        # determine if money or percent
        if '$' in st or '%' in st:
            money = st.replace('$', '').replace('%', '')
            try:
                float(money)
                newstring += st + ' '
                continue
            except ValueError:
                pass
        # if proper noun, keep
        if edit_distance(st, st.lower()) == 1:
            if all(char.isalpha() for char in st):
                newstring += st + ' '
                continue
    # return fixed string
    return newstring
def next_line():
    second = session.attributes['lyric line']
    next_line, artist_song = read_songs()
    query = second.lower()
    # find the stored lyric line closest to the query
    q = min(artist_song.keys(), key=lambda x: edit_distance(x, query))
    dist = edit_distance(q, query)
    print(dist)
    if dist > 10:
        pass  # TODO: handle poor matches instead of answering anyway
    msg = next_line[q]
    return question(msg)
def merge(raw_dbr, mention_1, mention_2):
    dbr = raw_dbr.lower()
    if mention_1 is None or not isinstance(mention_1, str):
        return mention_2
    if mention_2 is None or not isinstance(mention_2, str):
        return mention_1
    # keep whichever mention is closer to the resource name
    ed_1 = edit_distance(mention_1.lower(), dbr)
    ed_2 = edit_distance(mention_2.lower(), dbr)
    ret = mention_2
    if ed_1 < ed_2:
        ret = mention_1
    return ret
def closest_word(word, vocab, threshold=5, sub_thres=2):
    '''Finds the closest word in the vocabulary (w.r.t. edit distance).

    Returns two words if no single closest word is found within `threshold`.
    '''
    best_word = word
    best_dist = float("inf")
    prefix_len_best = float("inf")
    for vocab_word in vocab:
        curr_dist = edit_distance(word, vocab_word)
        if curr_dist < best_dist:
            best_dist = curr_dist
            best_word = vocab_word
            prefix_len_best = len(os.path.commonprefix([word, vocab_word]))
        elif curr_dist == best_dist and abs(len(best_word) - len(word)) > abs(len(vocab_word) - len(word)):
            # tie-break on length difference, then longest common prefix
            prefix_len_vocab = len(os.path.commonprefix([word, vocab_word]))
            if prefix_len_best < prefix_len_vocab:
                best_word = vocab_word
                prefix_len_best = prefix_len_vocab
    if best_dist > threshold:
        # no close single word: try splitting into two subwords;
        # margin of error is sub_thres for each subword
        for i in range(len(word) - 1):
            word1 = word[:i + 1]
            word2 = word[i + 1:]
            curr_dist = float("inf")
            vocab_word1 = word1
            for vocab_word in vocab:
                if word1 == vocab_word:
                    vocab_word1 = vocab_word
                    curr_dist = 0
                    break
                dist1 = edit_distance(word1, vocab_word)
                if dist1 < curr_dist:
                    vocab_word1 = vocab_word
                    curr_dist = dist1
            vocab_word2 = word2
            if curr_dist <= sub_thres:
                curr_dist2 = float("inf")
                for vocab_word in vocab:
                    if word2 == vocab_word:
                        vocab_word2 = vocab_word
                        curr_dist2 = 0
                        break
                    dist2 = edit_distance(word2, vocab_word)
                    if dist2 < curr_dist2:
                        vocab_word2 = vocab_word
                        curr_dist2 = dist2
                curr_dist += curr_dist2
                if curr_dist < best_dist:
                    best_word = vocab_word1 + ' ' + vocab_word2
                    best_dist = curr_dist
    return best_word
class SpellingReplacer(object):
    def __init__(self, dict_name='en', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist

    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            # return every suggestion within max_dist of the original word
            return [sugst for sugst in suggestions
                    if edit_distance(word, sugst) <= self.max_dist]
        else:
            return word
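# Hypothetical usage of SpellingReplacer (assumes pyenchant with an 'en'
# dictionary; exact suggestions depend on the installed dictionary). Note
# that replace() returns a *list* of candidates when a close suggestion
# exists, and the word itself otherwise:
replacer = SpellingReplacer(max_dist=2)
print(replacer.replace('cookbok'))   # e.g. ['cookbook', ...]
print(replacer.replace('language'))  # 'language'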
def asciiSpell(word):
    spell_dict = enchant.Dict('en_US')
    max_dist = 2
    if spell_dict.check(word):
        return word
    # rank suggestions that reuse the word's letters (anagrams) first
    suggestions = sorted(spell_dict.suggest(word),
                         key=lambda sugg: 0 if sameletters(word, sugg) else 1)
    if suggestions and edit_distance(suggestions[0], word) <= max_dist:
        return suggestions[0]
    return word
def searchEvidenceByTitle(request):
    if request.method == 'POST':
        data = json.loads(request.body)
        collection_id = data['collection_id']
        title = data['title']
        result_limit = data['result_limit']
        include_personal = data['include_personal']
        user_id = data['user_id']
        # DONE: treat the given title as a series of separate terms
        title_terms = title.split(' ')
        print title_terms
        evidence = Evidence.objects.filter(
            Q(created_by=collection_id) &
            reduce(lambda x, y: x & y,
                   [Q(title__icontains=word) for word in title_terms]))
        if include_personal:
            personal_evidence = Evidence.objects.filter(
                Q(created_by=user_id) &
                reduce(lambda x, y: x & y,
                       [Q(title__icontains=word) for word in title_terms]))
            evidence = chain(evidence, personal_evidence)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        evidence = json.loads(evidence_json)
        pprint.pprint(evidence)
        for e in evidence:
            e['dist'] = edit_distance(title, e['title'])
        print 'result limit'
        print result_limit
        evidence = sorted(evidence, key=lambda e: e['dist'])[:result_limit]
        for e in evidence:
            e['topic'] = -1
            try:
                e['topic'] = EvidenceTopic.objects.get(evidence=e['id']).primary_topic
            except ObjectDoesNotExist:
                if len(e['abstract']) > 50:
                    name = Collection.objects.get(collection_id=collection_id).collection_name
                    topic_dist, primary_topic_terms = TopicModeler.get_document_topics(e['abstract'], name)
                    primary_topic_tuple = max(topic_dist, key=lambda x: x[1])
                    e['topic'] = primary_topic_tuple[0]
                else:
                    print 'warning: evidence with no topic'
        return HttpResponse(json.dumps(evidence), status=status.HTTP_200_OK)
    elif request.method == 'GET':
        collection_id = 13
        title = 'UpSet: Visualization of Intersecting Sets'
        evidence = Evidence.objects.filter(created_by=collection_id)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        evidence = json.loads(evidence_json)
        for e in evidence:
            e['dist'] = edit_distance(title, e['title'])
        evidence = sorted(evidence, key=lambda e: e['dist'])
        return HttpResponse(json.dumps(evidence[:20]), status=status.HTTP_200_OK)
def fun_1_5_1():
    # edit distance, implemented in pure Python
    def _edit_dist_init(len1, len2):
        lev = []
        for i in range(len1):
            lev.append([0] * len2)  # initialize 2D array to zero
        for i in range(len1):
            lev[i][0] = i  # column 0: 0, 1, 2, 3, 4, ...
        for j in range(len2):
            lev[0][j] = j  # row 0: 0, 1, 2, 3, 4, ...
        return lev

    def _edit_dist_step(lev, i, j, s1, s2, transpositions=False):
        c1 = s1[i - 1]
        c2 = s2[j - 1]
        # skipping a character in s1
        a = lev[i - 1][j] + 1
        # skipping a character in s2
        b = lev[i][j - 1] + 1
        # substitution
        c = lev[i - 1][j - 1] + (c1 != c2)
        # transposition
        d = c + 1  # never picked by default
        if transpositions and i > 1 and j > 1:
            if s1[i - 2] == c2 and s2[j - 2] == c1:
                d = lev[i - 2][j - 2] + 1
        # pick the cheapest
        lev[i][j] = min(a, b, c, d)

    def edit_distance(s1, s2, transpositions=False):
        # set up a 2-D array
        len1 = len(s1)
        len2 = len(s2)
        lev = _edit_dist_init(len1 + 1, len2 + 1)
        # iterate over the array
        for i in range(len1):
            for j in range(len2):
                _edit_dist_step(lev, i + 1, j + 1, s1, s2,
                                transpositions=transpositions)
        return lev[len1][len2]

    # compare against nltk's implementation (this import shadows the local def)
    from nltk.metrics import edit_distance
    print edit_distance('relate', 'relation')
    print edit_distance("suggestion", "calculation")
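# Quick demonstration of the transpositions flag using nltk's own
# edit_distance, whose behaviour the hand-rolled version above mirrors:
from nltk.metrics import edit_distance as nltk_edit_distance

print(nltk_edit_distance('abc', 'acb'))                       # 2: two substitutions
print(nltk_edit_distance('abc', 'acb', transpositions=True))  # 1: one transposition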
def __match_distance__(self):
    '''
    Match a node lemma with the lemma in the text at the shortest string
    distance.
    :return:
    '''
    fnodes = filter(lambda node: node not in self.solved, self.nodes)
    for node in fnodes:
        lemma = self.nodes[node]['lemma']
        # get the positions of the nodes already solved
        order_ids = map(lambda node: self.nodes[node]['order_id'], self.solved)
        candidates = filter(lambda x: x.i not in order_ids, self.doc)
        candidates = map(lambda x: (x, edit_distance(x.lemma_, lemma)), candidates)
        candidates.sort(key=lambda x: x[1])
        if len(candidates) > 0:
            order_id, realization = candidates[0][0].i, unicode(candidates[0][0])
            self.nodes[node]['order_id'] = order_id
            self.nodes[node]['realization'] = realization  # + u'_dist'
            # add to lexicon
            # self.__add_lexicon__(node, realization)
        else:
            self.nodes[node]['order_id'] = -1
            self.nodes[node]['realization'] = lemma  # + u'_dist'
        self.solved.append(node)
def replace(self, word):
    if self.spell_dict.check(word):
        return word
    suggestions = self.spell_dict.suggest(word)
    if not suggestions:
        return word
    distance = [edit_distance(word, suggestedWord) for suggestedWord in suggestions]
    if min(distance) <= self.max_dist:
        retVal = suggestions[distance.index(min(distance))]
        # prefer the first minimum-distance suggestion whose length matches
        lengthMatched = False
        i = 0
        for ed in distance:
            if ed == min(distance):
                if len(word) == len(suggestions[i]) and not lengthMatched:
                    retVal = suggestions[i]
                    lengthMatched = True
            i += 1
    else:
        retVal = word
    return retVal
def _GetScore(self, query, match):
    """Custom edit-distance based scoring."""
    str_query = str(query)
    str_candidate = str(match.key)
    dist = float(edit_distance(str_query, str_candidate))
    max_len = float(max(len(str_query), len(str_candidate)))
    # 1.0 for an exact match, 0.0 for a completely different string
    return (max_len - dist) / max_len
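# Worked example of the scoring formula above, as a standalone helper:
# edit_distance('kitten', 'sitting') is 3 and the longer string has
# length 7, so the score is (7 - 3) / 7 ≈ 0.571; identical strings score 1.0.
def normalized_similarity(a, b):
    dist = float(edit_distance(a, b))
    max_len = float(max(len(a), len(b)))
    return (max_len - dist) / max_len

print(normalized_similarity('kitten', 'sitting'))  # 0.5714...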
def string_matching(label1, label2):  # by Maedche and Staab
    """ (string, string) -> float

    Return the coefficient of similarity between two strings based on the
    Levenshtein distance (edit distance). It equals 1 for an exact match
    and 0 for no similarity.

    >>> string_matching('power','power')
    1.0
    >>> string_matching('power','abba')
    0.0
    """
    try:
        sm = float(min(len(label1), len(label2)) - edit_distance(label1, label2)) \
            / min(len(label1), len(label2))
        if sm < 0:
            return 0.0
        else:
            return sm
    except:
        print "Error found:"
        traceback.print_exc(file=sys.stdout)
        return 0
def process(self, statement):
    from chatterbot.conversation import Statement
    res = str(statement.text).split()
    options = []
    strr = ""
    with open('dict.csv', 'rb') as csvvfile:
        csvreader = csv.reader(csvvfile, delimiter=str(','))
        print "Word to be corrected is " + res[0]
        for row in csvreader:
            for col in row:
                k = edit_distance(res[0], col)
                if k < 3:
                    options.append(col)
                    aux = strr.split()
                    if col not in aux:
                        strr = strr + " " + col
                        print col
    response = Statement(
        "The query you entered seems wrong. Try possible options like" + strr)
    response.remove_response(
        "The query you entered seems wrong. Try possible options like" + strr)
    response.confidence = 1
    return response
def fuzzy_comparison(tokens_1, tokens_2, max_dist=1):
    """Compares the tokens based on fuzzy matching."""
    matched = 0
    # init_term_1 / init_term_2 are assumed to be module-level counts of the
    # initial number of tokens (they are not defined in this snippet)
    matched_len_1 = init_term_1 - len(tokens_1)
    matched_len_2 = init_term_2 - len(tokens_2)
    for token in reversed(tokens_1):
        if len(token) <= 2:
            tokens_1.remove(token)
            continue
        for tkn in reversed(tokens_2):
            if len(tkn) <= 2:
                tokens_2.remove(tkn)
                continue
            if metrics.edit_distance(token, tkn) <= max_dist:
                matched = matched + 1
                logging.debug("Match found for: " + token + " - " + tkn)
                tokens_2.remove(tkn)
                tokens_1.remove(token)
                break
    logging.info("Fuzzy match count: " + str(matched))
    score_1 = (matched_len_1 + matched) / float(init_term_1)
    score_2 = (matched_len_2 + matched) / float(init_term_2)
    return score_1, score_2
def process_spell_errors(self, query):
    """
    Process the query string and replace spelling errors with words from
    the corpus / English dictionary.

    query: a query string.
    """
    if config_params['spell_check']:
        from functools import reduce  # needed on Python 3
        split_query = query.split()
        result = []
        words_list = set(words.words()).union(data_dict['word_corpus'])
        for word in split_query:
            if word not in words_list and '*' not in word:
                print(colorize.magenta("%s is not in dict" % word))
                # pick the in-vocabulary word with the smallest edit distance
                words_distance = zip(words_list,
                                     map(lambda x: edit_distance(word, x), words_list))
                best_word = reduce(lambda x, y: x if x[1] <= y[1] else y,
                                   words_distance)[0]
                word = best_word
                print(colorize.green("replaced with %s" % word))
            result.append(word)
        return " ".join(result)
    return query
def spellChecker(sentences, file_name_s):
    dict_name = 'en_GB'
    spell_dict = enchant.Dict(dict_name)
    max_dist = 3
    corrected = []
    csv_writer = csv.writer(open(file_name_s, 'wb'))
    # csv_writer.writerow(HEADER2)
    for sentence in sentences:
        corrected_sent = ''
        sentence = str(sentence)
        # strip brackets and quotes before splitting into words
        sc = set(["[", "]", "'", '"'])
        words = ''.join([c for c in sentence if c not in sc])
        words = words.split()
        for word in words:
            print word
            suggestions = spell_dict.suggest(word)
            if suggestions and edit_distance(word, suggestions[0]) <= max_dist:
                corrected_sent = corrected_sent + " " + suggestions[0]
            else:
                corrected_sent = corrected_sent + " " + word
        corrected.append(corrected_sent)
        csv_writer.writerow([corrected_sent])
    print corrected
def get_hosts_helper(tweets):
    host_re = re.compile('host [A-Z][a-z]* [A-Z][a-z]*')
    all_hosts = dict()
    tweets = tweets.__dict__
    for key, tweetObj in tweets.items():
        tweet = ' '.join(tweetObj.words)
        possible_host_match = host_re.search(tweet)
        if possible_host_match:
            # strip the leading 'host ' from the matched span
            possible_host = tweet[possible_host_match.start() + 5:
                                  possible_host_match.end()]
            if possible_host in all_hosts:
                all_hosts[possible_host] = all_hosts[possible_host] + 1
            else:
                all_hosts[possible_host] = 1
        gg_reactions.extract_reaction('hosts', 'host', ' '.join(tweetObj.words))
    top_hosts = sorted(all_hosts.items(), key=lambda x: x[1], reverse=True)[:2]
    most_likely_host = [top_hosts[0][0]]
    # if the runner-up is a genuinely different name with a comparable number
    # of mentions, assume there are two hosts
    dist = edit_distance(most_likely_host[0].lower(), top_hosts[1][0].lower())
    relative_mention_amount = float(top_hosts[1][1]) / top_hosts[0][1]
    if dist >= 5 and relative_mention_amount > 0.60:
        most_likely_host.append(top_hosts[1][0])
    return most_likely_host
def correctSpell(word):
    suggestions = cf.hobj.suggest(word)
    if len(suggestions) != 0:
        distance = [edit_distance(word, s) for s in suggestions]
        return suggestions[distance.index(min(distance))]
    else:
        return word
def replace(self, word):
    if self.spell_dict.check(word):
        return word
    suggestions = self.spell_dict.suggest(word)
    if not suggestions:
        return word
    distance = [edit_distance(word, suggestedWord) for suggestedWord in suggestions]
    if min(distance) <= self.max_dist:
        retVal = suggestions[distance.index(min(distance))]
        # prefer a minimum-distance suggestion of the same length as the word
        for i, ed in enumerate(distance):
            if ed == min(distance) and len(word) == len(suggestions[i]):
                retVal = suggestions[i]
                break
    else:
        retVal = word
    return retVal
def get_top_from_edit_distance(word, suggested_words):
    """
    Ranks the suggested candidates to replace `word` by edit distance.

    :param word: the misspelled word
    :param suggested_words: candidate replacements
    :return: the 10 candidates closest to the original word
    """
    top = {}
    for suggested_word in suggested_words:
        value = edit_distance(word, suggested_word)
        if value not in top:
            top[value] = [suggested_word]
        elif suggested_word not in top[value]:
            top[value] += [suggested_word]
    # sort buckets by distance, then order each bucket by length similarity
    sorted_top = dict(sorted(top.items(), key=lambda x: x[0]))
    for k, v in sorted_top.items():
        sorted_top[k] = sorted(v, key=lambda x: abs(len(word) - len(x)))
    result = []
    for elements_array in sorted_top.values():
        result += elements_array
    return result[:10]
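# Illustrative call (the candidate list is made up): candidates are ordered
# by edit distance first, then by how close their length is to the input.
print(get_top_from_edit_distance('speling', ['spelling', 'spewing', 'sapling']))
# -> ['spewing', 'spelling', 'sapling']: the two distance-1 words come first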
def replace(self, word):
    suggestions = self.spell_dict.suggest(word)
    if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
        return suggestions[0]
    else:
        return word
def typo(addr, tar_str):
    tStart = time.time()
    file_js = dict()
    with open(addr, 'r') as f_stream:
        f_str = f_stream.read()
    if is_json(f_str):
        file_js = json.loads(f_str)
    inv_idx = file_js['tokens_o']
    tar_str = normalize_word(tar_str)
    # rank every indexed token by edit distance to the query
    result_li = []
    for it in inv_idx:
        result_li.append((edit_distance(tar_str, normalize_word(it)), it))
    result_li = sorted(result_li)
    # return at most the five closest tokens
    return_li = []
    lim = min(5, len(result_li))
    for idx in range(lim):
        return_li.append({'dist': result_li[idx][0], 'str': result_li[idx][1]})
    tEnd = time.time()
    print("\n typo \nIt cost %f sec" % (tEnd - tStart))
    return return_li
def edit_dis(user_sents):
    """Retrieves the question in the test data most similar to each user question."""
    # retrieve dataframe
    df = generate_pairs()
    # convert to lists for modeling
    x = df.Question.tolist()
    y = df.Answer.tolist()
    predicted = []
    for u_sent in user_sents:
        x_index = -1  # index of the most similar question so far
        ini_val = 1000  # smallest edit distance seen so far
        for i in range(len(x)):
            # token-level edit distance between user question and candidate
            val_dis = edit_distance(u_sent[0].split(), x[i].split())
            if val_dis < ini_val:
                ini_val = val_dis
                x_index = i
        predicted.append(y[x_index])
    return predicted
def service_tag(self, text, print_word=False):
    '''
    text: string input
    output: 0 or 1
    '''
    if self.tagger is None:
        self.tagger = []
        try:
            path = os.path.abspath('utils') + '/DictionaryUtils/service_tagger.txt'
            with open(path, 'r') as fp:
                data = fp.read().lower()
            self.tagger = set(data.split('\n'))
        except IOError:
            print('Warning: service_tagger.txt not read')
    self.tagger = set(self.tagger)
    # drop empty entries left over from splitting on newlines
    self.tagger.discard('')
    self.tagger.discard(' ')
    for w in text.split():
        for wrd in self.tagger:
            if edit_distance(w.lower(), wrd) <= 1:
                if print_word:
                    print(wrd)
                return 1
    return 0
def pun(sent, cat):
    """
    THIS IS THE FUNCTION YOU HAVE TO WRITE

    It takes an expression and a category as input, chooses a word in the
    expression, finds a word related to the category that sounds similar,
    and replaces the chosen word with this similar-sounding word to build
    a new expression.
    """
    # first, slice the string and choose a word in it;
    # clean it from punctuation beforehand
    sent = re.sub(r'[^\w\s]', '', sent)  # replace everything that is not a word character or whitespace
    sent = sent.split()  # split into words
    word_of_interest = sent[2]
    # second, load the category as a list - drink or food
    cat_list = category(cat)
    print(cat_list)
    # careful: not all words are in the pronouncing dictionary,
    # make sure to only keep those that are
    if word_of_interest not in arpabet:
        print("word not found in dictionary")
        return False
    # third, translate the list of words into their phonetic representations
    translated_category = [pronounce(word) for word in cat_list]
    # fourth, create a list of distances (use the edit_distance function
    # from nltk) between the chosen word's pronunciation and each candidate's
    distance_list = [edit_distance(pronounce(word_of_interest), phones)
                     for phones in translated_category]
    print(distance_list)
def replace_word(self, word):
    if self.dictionary.check(word):
        return word
    suggestions = self.dictionary.suggest(word)
    if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
        return suggestions[0]
    return word  # fall back to the original word instead of returning None
def replace(self, word):
    if self.spell_dict.check(word):
        return word
    suggestions = self.spell_dict.suggest(word)
    if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
        return suggestions[0]
    else:
        return word
def check_replace_word(word):
    if spell_dict.check(word):
        return word
    suggestions = spell_dict.suggest(word)
    if suggestions and edit_distance(word, suggestions[0]) < 2:
        return suggestions[0]
    else:
        return word
def spell_check(r, a, s, scores, weight=1):
    # normalized similarity in [0, 1], scaled by weight
    change = weight * (1 - (edit_distance(r, a) / float(max(len(r), len(a)))))
    if s in scores:
        # penalty for returning multiple of the same result when
        # one instance is incorrectly spelled
        return (scores[s] + change) / 2.0
    else:
        return change
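# Worked example for spell_check above (arguments are illustrative): for
# result 'colour' against answer 'color', the edit distance is 1 and the
# longer length is 6, so change = 1 * (1 - 1/6) ≈ 0.833 on a first sighting.
print(spell_check('colour', 'color', 'colour', {}))  # ~0.833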
def ordered_content_distance(self, sentence, normalized=True):
    """Normalized levenshtein distance on (ordered) content words
    between `self` and `sentence`."""
    self_content_words = self.content_words
    sentence_content_words = sentence.content_words
    distance = edit_distance(self_content_words, sentence_content_words)
    norm = max(len(self_content_words), len(sentence_content_words))
    return distance / norm if normalized else distance
def raw_distance(self, sentence, normalized=True):
    """Normalized levenshtein distance between `self.text` and `sentence.text`."""
    self_text = self.text
    sentence_text = sentence.text
    distance = edit_distance(self_text, sentence_text)
    norm = max(len(self_text), len(sentence_text))
    return distance / norm if normalized else distance
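# Note on the two methods above: nltk's edit_distance accepts arbitrary
# sequences, so the content-word variant computes a word-level Levenshtein
# distance while raw_distance stays character-level. A minimal check:
from nltk.metrics import edit_distance

print(edit_distance(['the', 'cat', 'sat'], ['the', 'dog', 'sat']))  # 1 word swap
print(edit_distance('the cat sat', 'the dog sat'))                  # 3 characters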
def get_string_similarity(p_token, h_token):
    distance = edit_distance(h_token, p_token)
    max_length = max(len(h_token), len(p_token))
    score = 0
    if max_length > 2:
        score = 1 - (distance / (max_length - 1.99999999999999))
        # if score > 1:
        #     logging.warning('score > 1 for %s, %s' % (p_token, h_token))
    return max(0, score)
def similar(self, word):
    names = self.table_names() + self.column_names() + self.row_names()
    best = 100
    best_word = None
    for name in names:
        dist = edit_distance(name, word)
        if dist <= best:
            best, best_word = dist, name
        # print "Best word: " + best_word + " for " + word + ". Distance: " + str(dist)
    return best_word
def spell_correct(unigrams, Dict):
    for i, raword in enumerate(unigrams):
        # skip empty tokens, mentions and hashtags
        if raword == "" or raword[0] == '@' or raword[0] == '#':
            continue
        suggestions = Dict.suggest(raword)
        if suggestions and not Dict.check(raword):
            if edit_distance(suggestions[0], raword) < 2:
                # assign by index; rebinding `raword` would not update the list
                unigrams[i] = suggestions[0]
    return unigrams
def chug():
    for title in dir_ocr:
        with open(ocr + title, "r") as o_open, open(lines, "r") as l_open:
            # lists of lines for each doc
            o_open_r = o_open.readlines()
            l_open_r = l_open.readlines()
            o_line = 0
            for o in o_open_r:
                d = {}
                # strip ocr line of punctuation/whitespace
                o_1 = p.depunc(o.decode("utf-8"))
                l_line = 0
                o_line += 1
                for l in l_open_r:
                    # strip 'known' line of punctuation/whitespace
                    l_1 = p.depunc(l.decode("utf-8"))
                    l_line += 1
                    # ignore ocr lines with few characters (still counting the
                    # line), and skip ocr lines under half or over 1.5x the
                    # length of the reference 'known' line (does this improve
                    # performance?)
                    if len(o_1) < 4:
                        continue
                    if len(o_1) < 0.5 * len(l_1) or len(o_1) > 1.5 * len(l_1):
                        continue
                    # similarity between 0 (not similar) and 1 (exact match)
                    x = len(o_1) + len(l_1)
                    dist = (x - metrics.edit_distance(o_1, l_1)) / float(x)
                    key = ('"' + title + "| " + str(o_line) + '","' +
                           o.rstrip("\n") + '","' + "line: " + str(l_line) +
                           '","' + l.rstrip("\n") + '"')
                    d[key] = dist
                # keep the top score for each ocr line and append it to file
                if len(d) > 0 and max(d.values()) > 0.85:
                    m = max(d, key=d.get)
                    with open(output, "a") as f:
                        f.write(str(m) + "," + str(max(d.values())) + "\n")
                    print str(m).decode("utf-8") + ",", max(d.values())
def between(a, b):
    """Returns the edit distance between two strings.

    >>> EditDistance.between('abc', 'abc')
    0
    >>> EditDistance.between('abc', 'def')
    3
    >>> EditDistance.between('abcd', 'abef')
    2
    """
    return edit_distance(a, b)
def main(argv):
    inputfile = ''
    inputheader = ''
    try:
        opts, args = getopt.getopt(argv, "hi:d:", ["ifile=", "iheader="])
    except getopt.GetoptError:
        print 'test.py -i <inputfile> -d <inputheader>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py -i <inputfile> -d <inputheader>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-d", "--iheader"):
            inputheader = arg
    print inputfile
    header = '##engine '
    body = ''
    tree = ET.parse(inputfile)
    root = tree.getroot()
    for headernode in root.findall('header'):
        for tool in headernode.iter('tool'):
            body = body + tool.attrib["engine"] + ' '
    for word in root.iter('word'):
        original = word.find('original').text
        header = header + original + ' '
        status = word.find('status').text
        if status == 'SplErr':
            expected = word.find('expected').text
            a = original  # the misspelled word
            c = expected  # the reference word
            count = 0
            total = 0
            for suggestion in word.iter('suggestion'):
                # b is one of the suggestions offered by the spell checker engine
                b = suggestion.text
                count = count + 1
                total = total + round(edit_distance(a, b) / float(len(c)), 2)
            result = total / count
            body = body + str(round(result, 3)) + ' '
        else:
            body = body + '0 '
    f = open('test.dat', 'a')
    if inputheader == 'true':
        f.write(header + '\n')
    f.write(body + '\n')
    f.close()
def str_common_word(str1, str2):
    str1, str2 = str1.lower(), str2.lower()
    words, cnt, words2 = str1.split(), 0, str2.split()
    for word in words:
        if len(words2) < 10 and len(words) < 4:
            # for short strings, allow fuzzy matches one edit away
            for word2 in words2:
                if edit_distance(word, word2, transpositions=False) <= 1:
                    cnt += 1
        else:
            if str2.find(word) >= 0:
                cnt += 1
    return cnt
def spellcheck(wordtoken):
    if wordtoken == "":
        return wordtoken
    if not DICT.check(wordtoken):
        suggestions = DICT.suggest(wordtoken)
        if suggestions:
            # return the first suggestion within two edits
            for suggestion in suggestions:
                if edit_distance(wordtoken, suggestion) <= 2:
                    return suggestion
    return wordtoken
def str_common_word(str1, str2):
    words, cnt = str1.split(), 0
    for word in words:
        if str2.find(word) >= 0:
            cnt += 1
        # new: fall back to edit distance for near-matches
        if cnt == 0 and len(word) > 3:
            s1 = [z for z in list(set(str2.split(" ")))
                  if abs(len(z) - len(word)) < 2]
            t1 = sum([1 for z in s1 if edit_distance(z, word) < 2])
            if t1 > 1:
                cnt += 0.5
    return cnt
def getOrthographicVariants(word, limit=20):
    """Use flookup and the orthographic variation FST to get possible
    alternate spellings/transcriptions of the word. Return these ranked
    by their minimum edit distance from the word.
    """
    print '\n\n\nTRYING TO GET VARIANTS FOR: %s\n\n\n' % word
    # Check that we have the orthographic variation FST file
    if orthographicVariationBinaryFileName not in os.listdir(parserDataDir):
        return []
    # Check that the nltk module is installed
    try:
        from nltk.metrics import edit_distance
    except ImportError:
        return []
    # Get variants from flookup
    word = u'#%s#' % word
    orthographicVariationBinaryFilePath = os.path.join(
        parserDataDir, orthographicVariationBinaryFileName)
    process = subprocess.Popen(
        ['flookup', '-x', '-i', orthographicVariationBinaryFilePath],
        shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    process.stdin.write(word.encode('utf-8'))
    result = unicode(process.communicate()[0], 'utf-8').split('\n')
    # print 'Number of results from flookup: %d' % len(result)
    # Remove results that are too long or too short
    margin = 2
    if len(result) > 1000:
        margin = 1
    result = [x for x in result
              if len(word) - margin < len(x) < len(word) + margin]
    # print 'Number of results needing edit distancing: %d' % len(result)
    # Sort variants by minimum edit distance
    result = [(x, edit_distance(word, x)) for x in result]
    result.sort(key=lambda x: x[1])
    # Take only the top <limit> results
    result = result[:limit]
    # Remove the first result if it has a MED of 0 (i.e. the word itself)
    if result and result[0][1] == 0:
        result = result[1:]
    result = [x[0][1:-1] for x in result if x]  # remove hash symbols
    return result
def get_sim(analys_dict):
    sim_dict = {}
    name_list = list(analys_dict.keys())
    for name1 in name_list:
        # iterate over a copy so removals don't skip elements
        for name2 in list(name_list):
            dist = edit_distance(name1, name2)
            if dist < 3 and dist > 0:
                sim_dict[analys_dict[name1]] = analys_dict[name2]
                name_list.remove(name2)
                print "%d %s : %d %s" % (analys_dict[name1], name1,
                                         analys_dict[name2], name2)
    return sim_dict
def min_distance_one(original, find):
    # check the colour and name
    if original is None:
        return None
    new = None
    min_dist = 100
    for j in find:
        distance = edit_distance(original, j, transpositions=False)
        if distance < min_dist:
            min_dist = distance
            new = j
    # print "Name we are looking for: %s. It is similar to: %s" % (new, original)
    return new
def misspell_distance(self, word):
    if self.spell_dict.check(word):
        return 0
    if word.isdigit():
        return 0
    suggestions = self.spell_dict.suggest(word)
    if suggestions:
        # print >> sys.stderr, "%r => %r" % (word, suggestions[0])
        ed = edit_distance(word, suggestions[0])
        if ed:
            return ed
    # no suggestions, or a zero distance: report the maximum penalty
    return self.max_dist