def getTreffer(self):
    # a list of ids is returned
    #print "get Treffer"
    daten = self.Datenbank.getDataAsList(
        "select deutsch, fremd from vokabeln where id like " + str(self.ids))
    #print "comparison between " + str(daten[0][1]) + " and " + str(self.wort)
    if self.richtung == 1:
        if leve.distance(daten[0][1], self.wort) <= int(self.distanz) \
                and leve.jaro(daten[0][1], self.wort) > round((self.minTreffer / 100), 2):
            self.direktTreffer = True
            #print self.ids
            return [self.ids]
    else:
        if leve.distance(self.Vergeleichsfaehigkeit(daten[0][0]), self.Vergeleichsfaehigkeit(self.wort)) <= int(self.distanz) \
                and leve.jaro(self.Vergeleichsfaehigkeit(daten[0][0]), self.Vergeleichsfaehigkeit(self.wort)) > round((self.minTreffer / 100), 2):
            #print "Levenshtein comparison between " + str(daten[0][0]) + " and " + str(self.wort)
            self.direktTreffer = True
            #print self.ids
            return [self.ids]
    rueckgabe = []
    for i in self.liste:
        #print "current comparison " + unicode(i[0]) + " and " + unicode(self.wort)
        if leve.distance(i[0], self.wort) <= int(self.distanz) and leve.jaro(i[0], self.wort) > 0.7:
            rueckgabe.append(i[1])
    return rueckgabe
def find_by_distance(self, word, wordIdx):
    # `word` and `wordIdx` were free variables in the original snippet;
    # they are taken as parameters here so the method is self-contained.
    word_nl = getNormalWord(word).lower()
    min_dist = 10
    min_dist_word_idx = wordIdx
    import Levenshtein
    for i in range(0, self.mTotalEntries):
        word_ = getNormalWord(self.getWord(i)).lower()
        dist = Levenshtein.distance(word_nl, word_)
        if dist == 1:
            return i
        if dist < min_dist:
            min_dist = dist
            min_dist_word_idx = i
    for i in range(0, self.derived_dict.mTotalEntries):
        word_ = getNormalWord(self.derived_dict.getWord(i)).lower()
        dist = Levenshtein.distance(word_nl, word_)
        if dist == 1:
            words = self.derived_dict.getExplanations(word_)
            return self.getWordIdxInternal(words[0])
        if dist < min_dist:
            min_dist = dist
            words = self.derived_dict.getExplanations(word_)
            min_dist_word_idx = self.getWordIdxInternal(words[0])
    return min_dist_word_idx
def _match_user_agent(cls, user_agent): device = cls.objects.filter(user_agent=user_agent).order_by("-actual_device_root")[:1] if len(device): return device[0] else: if settings.UA_PREFIX_MATCHING: # Try more flexible matching, 1 third of the UA string ds_user_agent = user_agent[: len(user_agent) // 3] devices = cls.objects.filter(user_agent__startswith=ds_user_agent) devices = devices.order_by("-actual_device_root")[: settings.UA_PREFIX_MATCHING_LIMIT] if len(devices): user_agent = force_unicode(user_agent) best = reduce( lambda x, y: Levenshtein.distance(user_agent, x.user_agent) < Levenshtein.distance(user_agent, y.user_agent) and x or y, devices, ) if Levenshtein.distance(user_agent, best.user_agent) <= settings.UA_PREFIX_MATCHING_MAX_DISTANCE: return best if settings.UA_GENERIC_FALLBACK: # Try to match with generic properties # :TODO: raise NotImplemented raise NoMatch, "Can't find a match in currently installed WURFL table for user_agent `%s`" % user_agent
def _match_user_agent(cls, user_agent):
    device = cls.objects.filter(user_agent=user_agent).order_by('-actual_device_root')[:1]
    if len(device):
        return device[0]
    else:
        if settings.UA_PREFIX_MATCHING:
            #~ Try more flexible matching, from 1/3rd to 1/10th of the original UA string.
            #~ We break out as soon as we get a match (or matches, in which case we use
            #~ Levenshtein distance to determine which one we want to use) or if the
            #~ shortened UA string is less than 5 characters long.
            devices = []  # initialized to a list so len() below is safe when the loop breaks early
            for factor in range(3, 10):
                if len(user_agent) / factor <= 5:
                    break
                devices = cls._match_partial_user_agent(user_agent, factor)
                if len(devices):
                    break
            if len(devices):
                user_agent = force_unicode(user_agent)
                best = reduce(
                    lambda x, y: Levenshtein.distance(user_agent, x.user_agent)
                    < Levenshtein.distance(user_agent, y.user_agent)
                    and x or y,
                    devices,
                )
                if Levenshtein.distance(user_agent, best.user_agent) <= settings.UA_PREFIX_MATCHING_MAX_DISTANCE:
                    return best
        if settings.UA_GENERIC_FALLBACK:
            # Try to match with generic properties
            # :TODO:
            raise NotImplementedError('Generic properties matching is not implemented')
        raise NoMatch("Can't find a match in currently installed WURFL table for user_agent `%s`" % user_agent)
def get_closest_match(self, cells, matching_threshold, suppress_non_answer_cells=False):
    """Returns a list of cells that most closely match the question prompt.

    If no match is better than the matching_threshold, the empty list
    will be returned.
    """
    return_value = []
    distances = [Levenshtein.distance(self.start_md, u''.join(cell['source'])) for cell in cells]
    if min(distances) > matching_threshold:
        return return_value
    best_match = argmin(distances)
    if self.stop_md == u"next_cell":
        end_offset = 2
    elif len(self.stop_md) == 0:
        end_offset = len(cells) - best_match
    else:
        distances = [Levenshtein.distance(self.stop_md, u''.join(cell['source']))
                     for cell in cells[best_match:]]
        if min(distances) > matching_threshold:
            return return_value
        end_offset = argmin(distances)
    if len(self.question_heading) != 0 and not suppress_non_answer_cells:
        return_value.append(NotebookExtractor.markdown_heading_cell(self.question_heading, 2))
    if not suppress_non_answer_cells:
        return_value.append(cells[best_match])
    return_value.extend(cells[best_match + 1:best_match + end_offset])
    return return_value
def write_lex_stats(b, num, syll=None):
    """Use Levenshtein package to calculate lev and count up mps, neighbors, etc."""
    total = 0.
    mps = 0
    neighbors = 0
    homophones = 0
    lev_total = 0
    for item in itertools.combinations(b, 2):
        if syll != None:
            #if len(item[0].split("-")) == syll or len(item[1].split("-")) == syll:
            lev = Levenshtein.distance(re.sub("-", "", item[0]), re.sub("-", "", item[1]))
            if lev == 0:
                homophones += 1
            elif lev == 1:
                neighbors += 1
                if len(re.sub("-", "", item[0])) == len(re.sub("-", "", item[1])):
                    mps += 1
            total += 1
            lev_total += lev
        else:
            lev = Levenshtein.distance(re.sub("-", "", item[0]), re.sub("-", "", item[1]))
            if lev == 0:
                homophones += 1
            elif lev == 1:
                neighbors += 1
                if len(re.sub("-", "", item[0])) == len(re.sub("-", "", item[1])):
                    mps += 1
            total += 1
            lev_total += lev
    print str(num)
    f.write(",".join([str(x) for x in [num, homophones, mps, neighbors, lev_total/total, len(b)]]) + "\n")
    return
def strip_bogus_lines(arg_lines):
    re_blank = re.compile(r'^\s$')
    re_page_left = re.compile(r'^\s*Page [0-9]+')
    re_page_right = re.compile(r'Page [0-9]+\s*$')
    berk_string = 'BERKELEY TRAINING ASSOCIATES © 2009\n'
    mft_string = 'MFT PRACTICE EXAMINATIONS'
    lines = []
    for line in arg_lines:
        bogosity = 0.0
        if re_blank.search(line):
            bogosity += 1.0
        if re_page_left.search(line):
            bogosity += 0.5
        if re_page_right.search(line):
            bogosity += 0.5
        l = Levenshtein.distance(line[-(len(mft_string)):], mft_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        l = Levenshtein.distance(line[:len(mft_string)], mft_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        l = Levenshtein.distance(line, berk_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        if bogosity < 0.25:
            lines.append(line)
    return lines
def compare_list(self, company_name, table_name):
    # placeholder data; in practice the rows are strings
    iac_hz_list = [[0, 1, 2, 3, 4, 5, 6]]
    iac_guanwang_list = [[1, 1, 2, 3, 4, 5, 6]]
    hz_list = [x.replace("\n", "").replace("\r", "") for x in iac_hz_list]
    guanwang_list = [x.replace("\n", "").replace("\r", "") for x in iac_guanwang_list]
    if iac_hz_list is None:
        print "haizhi system lack %s's %s." % (company_name, table_name)
        return
    if iac_guanwang_list is None:
        print "guanwang lack %s's %s." % (company_name, table_name)
        return
    if len(iac_hz_list) == len(iac_guanwang_list):
        print "the length of list %s is same: %d" % (table_name, len(iac_hz_list))
    else:
        print "the length of list %s is different: %d, %d" % (table_name, len(iac_hz_list), len(iac_guanwang_list))
    # number of identical elements
    num = 0
    for row in hz_list:
        if row in guanwang_list:
            # guanwang_list.remove(row)
            # hz_list.remove(row)
            num += 1
            continue
        else:
            for row2 in guanwang_list:
                Levenshtein.distance(row, row2)
def byLevenshtein(key, result_yield):
    lang = "zha"
    try:
        str(key).encode('iso-8859-1')
    except UnicodeEncodeError:
        lang = "zh"
    result_list2d = []
    if lang == "zha":
        for i in result_yield:
            result_list2d.append([Levenshtein.distance(key, i[0]), i])
    else:
        for i in result_yield:
            for j in i[1]:
                list_tmp = split("[\[\]\(\)\ \;\,\。\,\.]", j)
                list_distance = []
                if len(list_tmp) == 2:  # 2 means this entry contains only 1 word
                    distance = Levenshtein.distance(key, list_tmp[1])
                    if distance == 0:
                        list_distance.append(-1)  # -1 means the best matched one
                    else:
                        list_distance.append(distance)
                    continue
                for tmp in list_tmp:
                    if key in tmp:
                        list_distance.append(Levenshtein.distance(key, tmp))
                result_list2d.append([min(list_distance), i])
                # This heuristic is not very accurate, but it tends to work
                # better than the previous one.
    result_list2d.sort()
    for i in result_list2d:
        yield i[1]
def read_type(left_read, right_read, left_enzsite, right_enzsite, left_bc, right_bc):
    """Determine if bisulfite read is watson or crick"""
    lr_enz_left = left_read[1][len(left_bc):len(left_bc) + 5]
    rr_enz_right = right_read[1][len(right_bc):len(right_bc) + 5]
    if left_enzsite == 'TACAA' and right_enzsite == 'TGCAG':
        return 'crick'
    elif right_enzsite == 'TACAA' and left_enzsite == 'TGCAG':
        return 'watson'
    elif right_enzsite == 'TGCAG' and left_enzsite == 'TGCAG':
        return 'gbs'
    else:
        # Enzyme sites have not been established correctly; establish read
        # type based on the closest matching enzyme site and CG count.
        watson_count = left_read[1].count('G') + right_read[1].count('C') + 0.001
        crick_count = left_read[1].count('C') + right_read[1].count('G') + 0.001
        left_distance = Levenshtein.distance(lr_enz_left, left_enzsite)
        right_distance = Levenshtein.distance(rr_enz_right, right_enzsite)
        if left_distance < right_distance:
            # left enz_site should be leading since it has fewer mismatches
            if left_enzsite == 'TACAA' and crick_count / float(watson_count) > 2:
                return 'crick'
            else:
                return 'nodet'
        else:
            if left_enzsite == 'TGCAG' and watson_count / float(crick_count) > 2:
                return 'watson'
            else:
                return 'nodet'
def splitted_word_distance(pattern, text):
    # remove trailing chars...
    words = text.split()
    lp = len(pattern)
    lw = len(words)
    if lp < lw:
        d = len(''.join(words[lp:]))
        n = lp
    else:
        if re_type in map(lambda e: type(e), pattern[lw:]):
            return float("inf")
        d = len(''.join(pattern[lw:]))
        n = lw
    d0 = [d]
    for i in range(n):
        word = words[i]
        p = pattern[i]
        if type(p) == str:
            d += levenshtein.distance(p, word)
            d0.append(levenshtein.distance(p, word))
        elif type(p) == re_type:
            if not p.fullmatch(word):
                return float("inf")
        else:
            raise Exception("Pattern element has wrong type %s" % (str(type(p))))
    return d
def find_similar(word):
    word_sound = fuzzy.nysiis(word)
    best = None
    best_dist = 99999
    for w in wordlist:
        if pylev.distance(word_sound, w[1]) < best_dist and word != w[0][:-1]:
            best_dist = pylev.distance(word_sound, w[1])
            best = (w[0], best_dist)
    return best
def testLev(self):
    s1 = "hello"
    s2 = "dog"
    s3 = "frog"
    s4 = "log"
    self.assertEqual(Lev.distance(s1, s2), 5)
    self.assertEqual(Lev.distance(s2, s3), 2)
    self.assertEqual(Lev.distance(s3, s4), 2)
    self.assertEqual(Lev.distance(s1, s1), 0)
def is_tRNA(self, seq):
    """Takes a sequence and determines whether or not it matches the
    criteria for being a tRNA.
    """
    length = len(seq)
    sub_size = 24
    t_loop_error = True
    acceptor_error = True
    cur_seq_specs = SeqSpecs()
    # Start the sliding window at the last 24 bases, and move to the left
    # one at a time
    for i in range(length - sub_size + 1):
        sub_str = seq[-(i + sub_size):(length - i)]
        t_loop_seq = sub_str[0:9]
        acceptor_seq = sub_str[-3:]
        t_loop_dist = (lev.distance("GTTC", sub_str[0:4]) + lev.distance("C", sub_str[8]))
        acceptor_dist = lev.distance("CCA", sub_str[-3:])
        mis_count = t_loop_dist + acceptor_dist
        if t_loop_dist < 1:
            t_loop_error = False
        else:
            t_loop_error = True
        if acceptor_dist < 1:
            acceptor_error = False
        else:
            acceptor_error = True
        if mis_count < cur_seq_specs.mis_count:
            cur_seq_specs.length = length
            cur_seq_specs.mis_count = mis_count
            cur_seq_specs.t_loop_error = t_loop_error
            cur_seq_specs.acceptor_error = acceptor_error
            cur_seq_specs.seq = seq
            cur_seq_specs.seq_sub = sub_str
            cur_seq_specs.t_loop_seq = t_loop_seq
            cur_seq_specs.acceptor_seq = acceptor_seq
        if mis_count < 2:
            cur_seq_specs = self.handle_pass_seq(cur_seq_specs, i)
            res_tup = (True, cur_seq_specs)
            return res_tup
    # Handles a failed sequence
    if cur_seq_specs.t_loop_error and cur_seq_specs.acceptor_error:
        if length < 24:
            self.stats_dict['short_rejected'] += 1
        else:
            self.stats_dict['both_rejected'] += 1
    elif cur_seq_specs.acceptor_error and not cur_seq_specs.t_loop_error:
        self.stats_dict['acceptor_seq_rejected'] += 1
    elif cur_seq_specs.t_loop_error and not cur_seq_specs.acceptor_error:
        self.stats_dict['t_loop_seq_rejected'] += 1
    self.stats_dict['total_rejected'] += 1
    res_tup = (False, cur_seq_specs)
    return res_tup
def nameSort(key, character):
    length = Levenshtein.distance(key, character['name'])
    if character.get('real_name'):
        n = Levenshtein.distance(key, character['real_name'])
        length = n if n < length else length
    if character.get('aliases'):
        for alias in character['aliases'].split():
            n = Levenshtein.distance(key, alias)
            length = n if n < length else length
    return length
def check_search_terms(self, search_term):
    global search_term_relevance_list
    distance = 1000
    relevance = 1
    for terms in search_term_relevance_list:
        if Levenshtein.distance(search_term, terms.search_term) < distance:
            distance = Levenshtein.distance(search_term, terms.search_term)
            relevance = terms.relevance
    return relevance
def identify_primer_with_mismatches(seq, fw, rev, max_mismatch=8):
    for primer in fw:
        d = Levenshtein.distance(seq[:len(primer)], primer)
        if d < max_mismatch:
            return +1, primer
    for primer in rev:
        d = Levenshtein.distance(seq[:len(primer)], primer)
        if d < max_mismatch:
            return -1, primer
    return None, None
def writeLevenshteinDistance(self, termIndex, dictionary, dictionaryLen, levThr, corpus_writer):
    term, termLen = dictionary[termIndex]
    # Compute candidates for Levenshtein distance of term from termToRowIndex.
    candidates = [None] * dictionaryLen
    i = 0
    for candidateId, (candidate, candidateLen) in dictionary.iteritems():
        # Add candidate if the difference between termLen and candidateLen
        # is less than or equal to 0.5*maxlen.
        if termLen >= candidateLen:
            diff = termLen - candidateLen
            if diff <= 0.5 * termLen:
                candidates[i] = (candidateId, candidate, candidateLen)
                i += 1
        else:
            diff = candidateLen - termLen
            if diff <= 0.5 * candidateLen:
                candidates[i] = (candidateId, candidate, candidateLen)
                i += 1
    # Grab the sublist excluding the preallocated values
    candidates = candidates[0:i]
    # Compute the similarity values
    sims = [None] * dictionaryLen
    i = 0
    for candidateId, candidate, candidateLen in candidates:
        # Split on length to avoid max(), which is slow here.
        if termLen >= candidateLen:
            sim = 1.0 - float(Levenshtein.distance(candidate, term)) / termLen
        else:
            sim = 1.0 - float(Levenshtein.distance(candidate, term)) / candidateLen
        if sim >= levThr:
            sims[i] = (candidateId, sim)
            i += 1
    # Grab the sublist excluding the preallocated values
    sims = sims[0:i]
    max_id, veclen = corpus_writer.write_vector(termIndex, sims)
    return veclen
def get_edit_dist(input_file, output_file, target_seq):
    fi = open(input_file, 'r')
    fo = open(output_file, 'w')
    fo.write("Chr\tLocation\tForward29\tReverse29\tEdit dist for\tEdit dist rev\t"
             "Step for\tStep rev\tDeletion# for\tDeletion# rev\t"
             "Bulge dist for\tBulge dist rev\n")
    for line in fi.xreadlines():
        units = line.split()
        chrm = units[0]
        loc = units[1]
        forseq = units[2]
        revseq = units[3]
        value = {}
        for nuc in "ATGC":
            # str.replace returns a new string, so the result must be kept
            seq = target_seq.replace('N', nuc)
            value[nuc] = (l.distance(seq, forseq), l.editops(seq, forseq))
        for_max = max(value, key=value.get)
        for_dist, for_editops = value[for_max]
        for_step = []
        for each in for_editops:
            a, b, c = each
            for_step.append(a)
        for_deletion = for_step.count('delete')
        value = {}
        for nuc in "ATGC":
            seq = target_seq.replace('N', nuc)
            value[nuc] = (l.distance(seq, revseq), l.editops(seq, revseq))
        rev_max = max(value, key=value.get)
        rev_dist, rev_editops = value[rev_max]
        rev_step = []
        for each in rev_editops:
            a, b, c = each
            rev_step.append(a)
        rev_deletion = rev_step.count('delete')
        bulge_l = [for_dist + for_deletion * 2, rev_dist + rev_deletion * 2]
        del_l = [for_deletion, rev_deletion]
        fo.write("%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\n" %
                 (chrm, loc, forseq, revseq, for_dist, rev_dist,
                  for_editops, rev_editops, for_deletion, rev_deletion,
                  bulge_l[0], bulge_l[1], min(bulge_l), del_l[bulge_l.index(min(bulge_l))]))
    fi.close()
    fo.close()
def get_closest_distances(flu_strings, host_strings):
    """For each flu sequence, find the closest distance to a host string."""
    min_distances = []
    for flu_s in flu_strings:
        dis = Levenshtein.distance(flu_s, host_strings[0])
        for host_s in host_strings[1:]:
            d = Levenshtein.distance(flu_s, host_s)
            dis = min([dis, d])
        min_distances.append(dis)
        print dis
    return min_distances
def match_something(item, list):
    item = item.replace(" ", "")
    item = item.replace(".", "")
    item = item.replace(",", "")
    lowest = list[0]
    lowestdelta = Levenshtein.distance(item, list[0])
    for entry in list:
        delta = Levenshtein.distance(item, entry)
        if delta < lowestdelta:
            lowestdelta = delta
            lowest = entry
            print(delta, item, entry)
    return lowest
def soundex_distance(ovv_snd, cand):
    try:
        lev = Levenshtein.distance(unicode(ovv_snd), soundex.soundex(cand.decode("utf-8", "ignore")))
    except UnicodeEncodeError:
        print('UnicodeEncodeError[ovv_snd]: %s %s' % (ovv_snd, cand))
        lev = Levenshtein.distance(ovv_snd, soundex.soundex(cand.encode("ascii", "ignore")))
    except UnicodeDecodeError:
        print('UnicodeDecodeError[ovv_snd]: %s %s' % (ovv_snd, cand))
        lev = Levenshtein.distance(ovv_snd, soundex.soundex(cand.decode("ascii", "ignore")))
    except TypeError:
        print('TypeError[ovv_snd]: %s %s' % (ovv_snd, cand))
        lev = 10.
    snd_dis = lev
    return snd_dis
def checkDifferentDescriptions(clientinfo, serverinfo):
    try:
        # sys.argv values are strings, so convert before comparing to a distance
        maxDistance = int(sys.argv[1])
    except (IndexError, ValueError):
        maxDistance = 5
    if clientinfo['name'] != serverinfo['name'] and \
            Levenshtein.distance(clientinfo['name'], serverinfo['name']) <= maxDistance:
        output(clientinfo, serverinfo)
    elif clientinfo['name'] != serverinfo['name'] and \
            clientinfo['name'].lower() == serverinfo['name'].lower():
        output(clientinfo, serverinfo)
    elif clientinfo['desc'] != serverinfo['desc'] and \
            Levenshtein.distance(clientinfo['desc'], serverinfo['desc']) <= maxDistance:
        output(clientinfo, serverinfo)
    elif clientinfo['desc'] != serverinfo['desc'] and \
            clientinfo['desc'].lower() == serverinfo['desc'].lower():
        output(clientinfo, serverinfo)
def test(clf):
    dvds = []
    with open("dvd.csv") as f:
        for i, j in enumerate(f):
            dvds.append(j)
    movies = []
    with open("movies.csv") as f:
        for i, j in enumerate(f):
            movies.append(j)
    dvds = [dvd for dvd in dvds if dvd > "B"]
    movies = [movie for movie in movies if movie > "B"]
    print(len(dvds), len(movies))
    with open("test.csv", "w") as f:
        i = 0
        for dvd in dvds:
            prefix = dvd[0]
            i += 1
            maxSimil = 0.0
            maxMovie = None
            for movie in movies:
                if movie[0] == prefix:
                    tempSim = lev.jaro(dvd, movie)
                    if tempSim > maxSimil:
                        maxSimil = tempSim
                        maxMovie = movie
            if maxMovie is None:  # no movie shares the first character
                continue
            temp = [
                1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                lev.jaro(dvd, maxMovie),
                lev.jaro_winkler(dvd, maxMovie),
                lev.ratio(dvd, maxMovie),
            ]
            print("%s\t%s\t%f\t%f" % (dvd.rstrip(), maxMovie.rstrip(),
                                      clf.decision_function(temp), clf.predict(temp)))
            f.write(
                "%s\t%s\t%f\t%f\t%f\t%f\t%f\t%i\n"
                % (
                    dvd.rstrip(),
                    maxMovie.rstrip(),
                    1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                    lev.jaro(dvd, maxMovie),
                    lev.jaro_winkler(dvd, maxMovie),
                    lev.ratio(dvd, maxMovie),
                    clf.decision_function(temp),
                    clf.predict(temp),
                )
            )
def filter_hits_by_distance(hits, source_text, min_similarity=DEFAULT_MIN_SIMILARITY):
    """Returns ES `hits` filtered according to their Levenshtein distance
    to the `source_text`.

    Any hits with a similarity value (0..1) lower than `min_similarity`
    will be discarded. It's assumed that `hits` is already sorted from
    higher to lower score.
    """
    if min_similarity <= 0 or min_similarity >= 1:
        min_similarity = DEFAULT_MIN_SIMILARITY
    filtered_hits = []
    for hit in hits:
        hit_source_text = hit['_source']['source']
        distance = Levenshtein.distance(source_text, hit_source_text)
        similarity = (
            1 - distance / float(max(len(source_text), len(hit_source_text)))
        )
        logger.debug(
            'Similarity: %.2f (distance: %d)\nOriginal:\t%s\nComparing with:\t%s',
            similarity, distance, source_text, hit_source_text
        )
        if similarity < min_similarity:
            break
        filtered_hits.append(hit)
    return filtered_hits
def __searchNbestNodeMatchRestricted__(self, segm_s, segm_best, valid_nodes, is_prefix):
    # TODO:
    segm = " ".join(segm_s).strip()
    n = len(segm)
    nbest_nodes = []
    for n_idx in valid_nodes:
        covered_sent = self.__nodes__[n_idx].getCoveredString().strip()
        if not is_prefix or (is_prefix and covered_sent[0:3] == "<s>"):
            covered_sent = covered_sent.replace("|UNK|UNK|UNK", "").replace("<s>", "").replace("</s>", "").strip()
            d = Levenshtein.distance(segm, covered_sent)
            d = min(d, n)
            err_lsc = d * log(self.err_p) + (n - d) * log(1.0 - self.err_p) \
                + log(fact(n)) - (log(fact(d)) + log(fact(n - d)))
            itp_lsc = self.__nodes__[n_idx].getInsideLogScore() + self.__nodes__[n_idx].getOutsideLogScore()
            cur_lsc = itp_lsc + self.err_w * err_lsc  # inside x outside x err**err_w
            if len(nbest_nodes) <= segm_best or cur_lsc > nbest_nodes[0][0]:
                nbest_nodes.append((cur_lsc, itp_lsc, n_idx))
                nbest_nodes = sorted(nbest_nodes)[-segm_best:]
    still_more_options = True
    if len(nbest_nodes) < segm_best:
        still_more_options = False
    return nbest_nodes[0], still_more_options
def clean(self):
    """
    Validates that old and new password are not too similar.
    """
    cleaned_data = super(PasswordPoliciesChangeForm, self).clean()
    old_password = cleaned_data.get("old_password")
    new_password1 = cleaned_data.get("new_password1")
    if old_password and new_password1:
        if old_password == new_password1 and not settings.PASSWORD_USE_HISTORY:
            raise forms.ValidationError(self.error_messages['password_identical'])
        else:
            if settings.PASSWORD_DIFFERENCE_DISTANCE:
                try:
                    import Levenshtein
                except ImportError:
                    pass
                else:
                    distance = Levenshtein.distance(old_password, new_password1)
                    if distance < settings.PASSWORD_DIFFERENCE_DISTANCE:
                        raise forms.ValidationError(self.error_messages['password_similar'])
    return cleaned_data
def __searchBestNodeMatch__(self, pref_s):
    if len(pref_s) == 0:
        node = self.__nodes__[self.__init_node__]
        ec_lsc = node.getInsideLogScore() + node.getOutsideLogScore()
        return self.__init_node__, ec_lsc, ec_lsc
    pref = " ".join(pref_s).strip()
    n = len(pref)
    max_lsc = float("-inf")
    max_node = None
    ordered_keys = sorted(self.__nodes__)
    # score = inside x outside x err
    for n_idx in ordered_keys:
        covered_sent = self.__nodes__[n_idx].getCoveredString().strip()
        if covered_sent[0:3] == "<s>":  # consider only nodes covering a prefix
            covered_sent = covered_sent.replace("|UNK|UNK|UNK", "").replace("<s>", "").replace("</s>", "").strip()
            d = Levenshtein.distance(pref, covered_sent)
            d = min(d, n)
            err_lsc = d * log(self.err_p) + (n - d) * log(1.0 - self.err_p) \
                + log(fact(n)) - (log(fact(d)) + log(fact(n - d)))
            itp_lsc = self.__nodes__[n_idx].getInsideLogScore() + self.__nodes__[n_idx].getOutsideLogScore()
            cur_lsc = itp_lsc + self.err_w * err_lsc
            if cur_lsc > max_lsc:
                max_lsc = cur_lsc
                max_itp_lsc = itp_lsc
                max_node = n_idx
    return max_node, max_lsc, max_itp_lsc
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string
    into the other. Transforming a string is carried out using a sequence of
    the following operators: delete a character, insert a character, and
    substitute one character for another.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6

    Note:
        This implementation internally uses the python-levenshtein package to
        compute the Levenshtein distance.
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    # using Levenshtein library
    return Levenshtein.distance(string1, string2)
def norm_levenshtein(str1, str2):
    max_len = float(max([len(str1), len(str2)]))
    try:
        return Levenshtein.distance(str1, str2) / max_len
    except ZeroDivisionError:
        # both sections are instrumental
        return 0
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import requests
import json
import os
import csv
import Levenshtein
from args import get_name_similair_args
from tqdm import tqdm

'''
English names may contain abbreviations or middle names,
e.g. Romit Jitendra Shah & Romit J. Shah, Daniel Durn & Daniel J. Durn,
so we calculate a similarity ratio to decide whether two different names
refer to the same person.

Four similarity measures are used:

Levenshtein.distance(s1, s2)
    Compute absolute Levenshtein distance of two strings.

Levenshtein.ratio(s1, s2)
    The similarity is a number between 0 and 1; it's usually equal to or
    somewhat higher than difflib.SequenceMatcher.ratio(), because it's
    based on real minimal edit distance.

Levenshtein.jaro(s1, s2)
    The Jaro string similarity metric, intended for short strings like
    personal names.

Levenshtein.jaro_winkler(s1, s2)
    A modification of the Jaro metric giving more weight to a common
    prefix, as spelling mistakes are more likely to occur near the ends
    of words. If no prefix weight is specified, 1/10 is used.

The result csv is written to the similar-result folder.
usage: python name_similar_ratio.py --company='AMAT' --token='yourtoken'
'''

def folder_builder(outputfolder):
    if not os.path.isdir(outputfolder):
        os.mkdir(outputfolder)
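A minimal sketch (not part of the original script) of how the four measures compare on one abbreviated-name pair; the strings are illustrative only:

import Levenshtein

s1, s2 = "Romit Jitendra Shah", "Romit J. Shah"
print(Levenshtein.distance(s1, s2))      # absolute edit distance (int)
print(Levenshtein.ratio(s1, s2))         # normalized similarity in [0, 1]
print(Levenshtein.jaro(s1, s2))          # Jaro similarity
print(Levenshtein.jaro_winkler(s1, s2))  # Jaro plus common-prefix bonus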
def eval_evt(etdata_gt, etdata_pr, n_events):
    t = time.time()
    if etdata_gt.evt is None:
        etdata_gt.calc_evt(fast=True)
    if etdata_pr.evt is None:
        etdata_pr.calc_evt(fast=True)

    # Levenshtein distance
    evt_gt = etdata_gt.evt['evt']
    evt_gt = evt_gt[~(evt_gt == 0)]
    evt_pr = etdata_pr.evt['evt']
    evt_pr = evt_pr[~(evt_pr == 0)]
    wer = Lev.distance(''.join(map(str, evt_gt)), ''.join(map(str, evt_pr))) / \
        float(len(evt_gt))

    _cer = map(lambda _a, _b: Lev.distance(_a, _b),
               ''.join(map(str, etdata_gt.data['evt'])).split('0'),
               ''.join(map(str, etdata_pr.data['evt'])).split('0'))
    mask = etdata_gt.data['evt'] == 0
    evt_len = float(sum(~mask))
    cer = sum(_cer) / evt_len

    # sample level K
    t = time.time()
    evts_gt_oh = convertToOneHot(etdata_gt.data['evt'], n_events)
    evts_pr_oh = convertToOneHot(etdata_pr.data['evt'], n_events)
    ks = [calc_k(evts_gt_oh[:, i], evts_pr_oh[:, i]) for i in range(1, n_events)]
    evt_gt = etdata_gt.data['evt']
    evt_gt = evt_gt[~(evt_gt == 0)]
    evt_pr = etdata_pr.data['evt']
    evt_pr = evt_pr[~(evt_pr == 0)]
    ks_all = metrics.cohen_kappa_score(evt_gt, evt_pr)
    ks.extend([ks_all])

    # event level K and F1
    try:
        t = time.time()
        ke_ = []
        f1e_ = []
        for evt in range(1, 4):
            _etdata_gt = copy.deepcopy(etdata_gt)
            mask_ext = _etdata_gt.data['evt'] == 0
            mask = _etdata_gt.data['evt'] == evt
            _etdata_gt.data['evt'][mask] = 1
            _etdata_gt.data['evt'][~mask] = 0
            _etdata_gt.data['evt'][mask_ext] = 255
            _etdata_gt.calc_evt(fast=True)

            _etdata_pr = copy.deepcopy(etdata_pr)
            mask_ext = _etdata_pr.data['evt'] == 0
            mask = _etdata_pr.data['evt'] == evt
            _etdata_pr.data['evt'][mask] = 1
            _etdata_pr.data['evt'][~mask] = 0
            _etdata_pr.data['evt'][mask_ext] = 255
            _etdata_pr.calc_evt(fast=True)

            evt_overlap, evt_gt, evt_pr = calc_KE(_etdata_gt, _etdata_pr)
            mask = (evt_gt == 255) & (evt_pr == 255)
            evt_gt = evt_gt[~mask]
            evt_pr = evt_pr[~mask]
            ke_.append(calc_k(evt_gt, evt_pr))
            f1e_.append(calc_f1(evt_gt, evt_pr))

        evt_overlap, evt_gt, evt_pr = calc_KE(etdata_gt, etdata_pr)
        mask = (evt_gt == 0) & (evt_pr == 0)
        evt_gt = evt_gt[~mask]
        evt_pr = evt_pr[~mask]

        evt_gt_oh = convertToOneHot(evt_gt, n_events)
        evt_pr_oh = convertToOneHot(evt_pr, n_events)
        ke = [calc_k(evt_gt_oh[:, i], evt_pr_oh[:, i]) for i in range(1, n_events)]
        f1e = [calc_f1(evt_gt_oh[:, i], evt_pr_oh[:, i]) for i in range(1, n_events)]
        ke_all = metrics.cohen_kappa_score(evt_gt, evt_pr)
        f1_all = metrics.f1_score(evt_gt, evt_pr, average='weighted')
        ke.extend([ke_all])
        ke_.extend([ke_all])
        f1e.extend([f1_all])
        f1e_.extend([f1_all])
    except:
        # TODO: Debug
        print("Could not calculate event level k")
        ks = [0., ] * (n_events + 1)
        ke = [0., ] * (n_events + 1)
        f1e = [0., ] * (n_events + 1)

    return wer, cer, ke_, ks, f1e_, (evt_overlap, evt_gt, evt_pr)
def word_distance(s, t):
    s = s.lower().strip()
    t = t.lower().strip()
    d = Levenshtein.distance(s, t)
    return d
def calcDist(line):
    fileName = line.strip()
    dist = Levenshtein.distance(fileName, askingFor)
    return (dist, fileName)
def do_logic(file_content):
    for input_line in file_content:
        string1, string2 = input_line.split(',')
        diff = Levenshtein.distance(string1, string2)
        if OUTPUT:
            print('%s %s %d' % (string1, string2, diff))
def get_org_infor(org1, org2):
    p_org1 = preprocessorg(org1)
    p_org2 = preprocessorg(org2)
    return (len(p_org1), len(p_org2), lv.distance(p_org1, p_org2))
def levenshtein(a, b):
    return -Levenshtein.distance(a, b)
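Negating the distance turns it into a score where larger means more similar, so it can be fed directly to max() or to APIs that expect similarities rather than distances. A hypothetical usage, with made-up candidate data:

query = "tranquility"
candidates = ["tranquillity", "tranquilizer", "serenity"]
best = max(candidates, key=lambda c: levenshtein(query, c))  # picks the closest candidate
print(best)  # "tranquillity" (edit distance 1)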
    'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I',
    'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at',
    'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she',
    'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what',
    'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me',
    'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'take',
    'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other',
    'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also',
    'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way',
    'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us'
])

print "calculating distances..."
(dim,) = words.shape
f = lambda (x, y): -leven.distance(x, y)
# distances are negated, so a signed dtype is needed (np.uint8 would wrap)
res = np.fromiter(itertools.imap(f, itertools.product(words, words)), dtype=np.int32)
A = np.reshape(res, (dim, dim))

af = AffinityPropagation().fit(A)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

unique_labels = set(labels)
for i in unique_labels:
    print words[labels == i]
forms_set = set()
with open('./polimorfologik-2.1/polimorfologik-2.1.txt', 'r', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=';')
    for row in reader:
        forms_set.add(row[1].lower())

not_existing_words = []
for word in data_counter.most_common():
    if word[0] not in forms_set:
        not_existing_words.append(word)

pprint(not_existing_words[:30])

triple_missing = [x for x in not_existing_words if x[1] == 3]
pprint(triple_missing[:30])

for word in triple_missing[:30]:
    correct = []
    for form in forms_set:
        lev_dist = Levenshtein.distance(word[0], form)
        if lev_dist < 4:
            correct.append((form, lev_dist))
    correct.sort(key=lambda x: x[1])
    print(word)
    pprint(correct[:3])
def fit(model, epochs, train_data_loader, valid_data_loader):
    best_leven = 1000
    optimizer = optim.AdamW(model.parameters(), 5e-4)
    len_train = len(train_data_loader)
    loss_func = nn.CTCLoss(blank=len(classes)).to(dev)
    for i in range(1, epochs + 1):
        # ============================= TRAINING =============================
        batch_n = 1
        train_levenshtein = 0
        len_levenshtein = 0
        for spectrograms, labels, input_lengths, label_lengths in tqdm(
                train_data_loader, position=0, leave=True, file=sys.stdout,
                bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
            model.train()
            spectrograms, labels = spectrograms.to(dev), labels.to(dev)
            optimizer.zero_grad()
            loss_func(
                model(spectrograms).log_softmax(2).permute(1, 0, 2),
                labels, input_lengths, label_lengths).backward()
            optimizer.step()
            # ================= TRAINING LEVENSHTEIN DISTANCE =================
            if batch_n > (len_train - 5):
                model.eval()
                with torch.no_grad():
                    decoded = model.beam_search_with_lm(spectrograms)
                    for j in range(0, len(decoded)):
                        actual = num_to_str(
                            labels.cpu().numpy()[j][:label_lengths[j]].tolist())
                        train_levenshtein += leven.distance(decoded[j], actual)
                        len_levenshtein += label_lengths[j]
            batch_n += 1
        # ============================ VALIDATION ============================
        model.eval()
        with torch.no_grad():
            val_levenshtein = 0
            target_lengths = 0
            for spectrograms, labels, input_lengths, label_lengths in tqdm(
                    valid_data_loader, position=0, leave=True, file=sys.stdout,
                    bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.BLUE, Fore.RESET)):
                spectrograms, labels = spectrograms.to(dev), labels.to(dev)
                decoded = model.beam_search_with_lm(spectrograms)
                for j in range(0, len(decoded)):
                    actual = num_to_str(
                        labels.cpu().numpy()[j][:label_lengths[j]].tolist())
                    val_levenshtein += leven.distance(decoded[j], actual)
                    target_lengths += label_lengths[j]
        print('Epoch {}: Training Levenshtein {} | Validation Levenshtein {}'.format(
            i, train_levenshtein / len_levenshtein,
            val_levenshtein / target_lengths), end='\n')
        # ============================ SAVE MODEL ============================
        if (val_levenshtein / target_lengths) < best_leven:
            torch.save(model.state_dict(),
                       f=str((val_levenshtein / target_lengths) * 100).replace('.', '_')
                       + '_' + 'model.pth')
            best_leven = val_levenshtein / target_lengths
import pandas as pd
import numpy as np
import Levenshtein
import random

random.seed(12345)

d = pd.read_csv("../data/asjp19wide.csv", index_col=0)
words = d.values[~d.isnull()]
words = np.concatenate([w.split('-') for w in words])

tests = pd.DataFrame(columns=['word1', 'word2', 'LD'])
for i in range(1000):
    if i % 100 == 0:
        print(i)
    w1, w2 = random.sample(list(words), 2)
    tests.loc[i] = [w1, w2, Levenshtein.distance(w1, w2)]

tests.to_csv('levenshteinTests.csv', index=False)
def eval_ctc(model, dataloader, idx2char):
    model.eval()
    t = time.time()
    total_dist = 0
    total_line_err = 0
    total_ratio = 0
    total_pred_char = 1
    total_label_char = 0
    total_samples = 0
    total_ned = 0
    for j, batch in enumerate(dataloader):
        imgs = batch[0].cuda()
        labels_length = batch[1].cuda()
        labels_str = batch[2]
        with torch.no_grad():
            outputs_ctc, _ = model(imgs)
            # outputs_att = decoder(sqs, label_att)
        prob = outputs_ctc.softmax(dim=2).cpu().numpy()
        pred = prob.argmax(axis=2)
        for k in range(pred.shape[1]):
            # collapse repeated characters, then drop the CTC blank ('-')
            pred_str = ""
            prev = " "
            for c in pred[:, k]:
                if idx2char[c] != prev:
                    pred_str += idx2char[c]
                prev = idx2char[c]
            pred_str = pred_str.strip()
            pred_str = pred_str.replace('-', '')
            dist = Levenshtein.distance(pred_str, labels_str[k])
            total_dist += dist
            ratio = Levenshtein.ratio(pred_str, labels_str[k])
            total_ratio += ratio
            total_ned += float(dist) / max(len(pred_str), len(labels_str[k]))
            total_pred_char += len(pred_str)
            total_label_char += len(labels_str[k])
            total_samples += 1
            if dist != 0:
                total_line_err += 1
                print('pred: ', pred_str)
                print('label:', labels_str[k])
    precision = 1.0 - float(total_dist) / total_pred_char
    recall = 1.0 - float(total_dist) / total_label_char
    ave_Levenshtein_ratio = float(total_ratio) / total_samples
    line_acc = 1.0 - float(total_line_err) / total_samples
    rec_score = 1.0 - total_ned / total_samples
    print("precision: %f" % precision)
    print("recall: %f" % recall)
    print("ave_Levenshtein_ratio: %f" % ave_Levenshtein_ratio)
    print("line_acc: %f" % line_acc)
    print("rec_score: %f" % rec_score)
    return line_acc, rec_score
import synonyms

fns = sys.argv[1:]
errors = 0
warns = 0
for fn in fns:
    with open(fn, mode="r") as f:
        vegetables = f.read().strip().split('\n')
    for _seq1, _seq2 in combinations(enumerate(vegetables, 1), 2):
        idx1, seq1 = _seq1
        idx2, seq2 = _seq2
        min_len = min(len(seq1), len(seq2))
        dist = Levenshtein.distance(seq1, seq2)
        if dist < 2 and min_len > 2:
            errors += 1
            sys.stderr.write("\n".join([
                f"{fn}:{idx1}:{idx2}: ERROR: Duplicate sentences",
                seq1,
                seq2,
                f"(distance={dist})",
                ""
            ]))
        elif dist < 6:
            sim = synonyms.compare(seq1, seq2)
            if sim > 0.9:
                warns += 1
                sys.stderr.write("\n".join([
                    f"{fn}:{idx1}:{idx2}: WARNING: Possible duplicate sentences",
                    seq1,
                    seq2,
                    f"(distance={dist}, similarity={sim})",
                    ""
                ]))
def attack(self, epsilon, alpha, attack_type="FGSM", PGD_round=40):
    print("Start attack")
    data, target = self.sound.to(self.device), self.target.to(self.device)
    data_raw = data.clone().detach()

    # initial prediction
    spec = torch_spectrogram(data, self.torch_stft)
    input_sizes = torch.IntTensor([spec.size(3)]).int()
    out, output_sizes = self.model(spec, input_sizes)
    decoded_output, decoded_offsets = self.decoder.decode(out, output_sizes)
    original_output = decoded_output[0][0]
    print(f"Original prediction: {decoded_output[0][0]}")

    ############ ATTACK GENERATION ##############
    if attack_type == "FGSM":
        data.requires_grad = True
        spec = torch_spectrogram(data, self.torch_stft)
        input_sizes = torch.IntTensor([spec.size(3)]).int()
        out, output_sizes = self.model(spec, input_sizes)
        out = out.transpose(0, 1)  # TxNxH
        out = out.log_softmax(2)
        loss = self.criterion(out, self.target, output_sizes, self.target_lengths)
        self.model.zero_grad()
        loss.backward()
        data_grad = data.grad.data
        perturbed_data = self.fgsm_attack(data, epsilon, data_grad)
    elif attack_type == "PGD":
        for i in range(PGD_round):
            print(f"PGD processing ... {i+1} / {PGD_round}", end="\r")
            data.requires_grad = True
            spec = torch_spectrogram(data, self.torch_stft)
            input_sizes = torch.IntTensor([spec.size(3)]).int()
            out, output_sizes = self.model(spec, input_sizes)
            out = out.transpose(0, 1)  # TxNxH
            out = out.log_softmax(2)
            loss = self.criterion(out, self.target, output_sizes, self.target_lengths)
            self.model.zero_grad()
            loss.backward()
            data_grad = data.grad.data
            data = self.pgd_attack(data, data_raw, epsilon, alpha, data_grad).detach_()
        perturbed_data = data
    ############ ATTACK GENERATION ##############

    # prediction of adversarial sound
    spec = torch_spectrogram(perturbed_data, self.torch_stft)
    input_sizes = torch.IntTensor([spec.size(3)]).int()
    out, output_sizes = self.model(spec, input_sizes)
    decoded_output, decoded_offsets = self.decoder.decode(out, output_sizes)
    final_output = decoded_output[0][0]

    perturbed_data = perturbed_data.detach()
    abs_ori = 20 * np.log10(
        np.sqrt(np.mean(np.absolute(data_raw.cpu().numpy()) ** 2)))
    abs_after = 20 * np.log10(
        np.sqrt(np.mean(np.absolute(perturbed_data.cpu().numpy()) ** 2)))
    db_difference = abs_after - abs_ori
    l_distance = Levenshtein.distance(self.target_string, final_output)
    print(f"Max Decibel Difference: {db_difference:.4f}")
    print(f"Adversarial prediction: {decoded_output[0][0]}")
    print(f"Levenshtein Distance {l_distance}")
    if self.save:
        torchaudio.save(self.save, src=perturbed_data.cpu(), sample_rate=self.sample_rate)
    self.perturbed_data = perturbed_data
    return db_difference, l_distance, self.target_string, final_output
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 15 20:38:46 2019

@author: anchor
"""

# text edit distance
import Levenshtein

a = Levenshtein.distance('abhg', 'bgj')
print(a)

from sklearn.cluster import KMeans

X = [[0.262, 4], [0.192, 4], [4.052, 1.98], [1, 19.59], [2, 3.5],
     [0.78, 10.6], [2, 10.5], [2.038, 7.38], [0.574, 11.6], [5, 1.06],
     [4.43, 4.78], [0.514, 43], [0.592, 31], [1, 16.2], [1, 7.39],
     [1, 95.9], [1, 23.29], [1, 12.8], [1.338, 2.6], [0.46, 19]]

# K-means clustering
clf = KMeans(n_clusters=4)
y_pred = clf.fit_predict(X)
print(clf)
print(y_pred)

import matplotlib.pyplot as plt

x = [n[0] for n in X]
y = [n[1] for n in X]
early_stopping = EarlyStopping(monitor='val_loss', patience=1, verbose=1)

model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=5,
    validation_data=valid_generator.forfit(),
    validation_steps=len(valid_generator),
    callbacks=[early_stopping],
    verbose=2,
)

pred_pl = model.predict_generator(test_generator.forfit(), steps=len(test_generator))
pred_pl = pred_pl[:, 1]
pred = res * 0.8 + pred_pl * 0.2
pred = (pred >= alpha).astype('int')

for i in range(len(test_data)):
    d = test_data[i]
    texta = d[0]
    textb = d[1]
    if Levenshtein.distance(texta, textb) == 0:
        pred[i] = 1

test_df['label'] = pred
sub = test_df[['id', 'label']]
sub.to_csv(("../prediction_result/result_" +
            datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + ".csv"),
           header=None, index=False)
def Edit_distance_str(str1, str2):
    import Levenshtein
    edit_distance = Levenshtein.distance(str1, str2)
    similarity = 1 - (edit_distance / max(len(str1), len(str2)))
    return {'Distance': edit_distance, 'Similarity': similarity}
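A quick usage check (assuming Python 3, where / is float division): "kitten" vs "sitting" has edit distance 3 and a maximum length of 7, so the similarity is 1 - 3/7:

print(Edit_distance_str("kitten", "sitting"))
# {'Distance': 3, 'Similarity': 0.5714285714285714}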
[4] generate strategy
[1-9] restudy test
generate test [4]
intervention test [4]
assessment test
[x] for each item, calculate Levenshtein distance from the correct answer
[x] for each item, create a boolean variable for different Levenshtein cutoff thresholds (1,2,3,4)
[x] for each user, calculate scores (0-10) using different Levenshtein cutoff thresholds
[x] in r, plot each criteria against testScore, look for a step fn between 9 and 10 that is inclusive
"""

### Calculate lev distances

### Calculate interventionStrategyLevDistance
df_items['interventionStrategyLevDistance'] = df_items.apply(
    lambda row: lev.distance(row['itemEnglish'],
                             row['interventionStrategyUserInputRound1']),
    axis=1)

for n in range(1, 3):
    colname = "interventionStrategyLevDist" + str(n)
    df_items[colname] = df_items.apply(
        lambda row: 1 if row['interventionStrategyLevDistance'] <= n else 0,
        axis=1)

### Calculate interventionTestLevDistance
df_items['interventionTestLevDistance'] = df_items.apply(
    lambda row: lev.distance(row['itemEnglish'],
                             row['interventionTestUserInput']),
    axis=1)

for n in range(1, 3):
def similarity(s, t):
    l_max = max(len(s), len(t))
    return round(1 - float(Levenshtein.distance(s, t)) / float(l_max), 2)
def train_end2end(model, vocab, datasets, use_feat):
    print("END2END model training...")
    print("Features:", STF_MODEL)
    print("Save Model path:", STF_MODEL_PATH)
    print("WER path:", END2END_WER_PATH)
    optimizer = Adam(model.parameters(), lr=END2END_LR)
    loss_fn = nn.CTCLoss(zero_infinity=True)
    lr_scheduler = ReduceLROnPlateau(optimizer, factor=0.2, patience=4)

    best_wer = get_best_wer()
    curve = {"train": [], "val": []}
    current_best_wer = float("inf")
    trained = False
    # number of epochs since the WER was last updated
    since_wer_update = 0
    try:
        for epoch in range(1, END2END_N_EPOCHS + 1):
            print("Epoch", epoch)
            for phase in ["train", "val"]:
                if phase == "train":
                    model.train()  # Set model to training mode
                else:
                    model.eval()
                dataset = datasets[phase]
                n_batches = dataset.start_epoch()
                losses = []
                hypes = []
                gts = []
                with torch.set_grad_enabled(phase == "train"):
                    pp = ProgressPrinter(n_batches, 25 if USE_ST_FEAT else 1)
                    for i in range(n_batches):
                        optimizer.zero_grad()
                        X_batch, Y_batch, Y_lens = dataset.get_batch(i)
                        X_batch = X_batch.to(DEVICE)
                        Y_batch = Y_batch.to(DEVICE)
                        preds = model(X_batch).log_softmax(dim=2)
                        T, N, V = preds.shape
                        X_lens = torch.full(size=(N,), fill_value=T, dtype=torch.int32)
                        loss = loss_fn(preds, Y_batch, X_lens, Y_lens)
                        losses.append(loss.item())
                        if phase == "train":
                            loss.backward()
                            optimizer.step()
                        out_sentences = predict_glosses(preds, decoder=None)
                        gts += [y for y in Y_batch.view(-1).tolist() if y != 0]
                        for sentence in out_sentences:
                            hypes += sentence
                        if i == 0 and SHOW_EXAMPLE:
                            pred = " ".join(vocab.decode(out_sentences[0]))
                            gt = Y_batch[0][:Y_lens[0]].tolist()
                            gt = " ".join(vocab.decode(gt))
                            print(" ", phase, 'Ex. [' + pred + ']', '[' + gt + ']')
                        if SHOW_PROGRESS:
                            pp.show(i, " ")
                    if SHOW_PROGRESS:
                        pp.end(" ")
                hypes = "".join([chr(x) for x in hypes])
                gts = "".join([chr(x) for x in gts])
                phase_wer = Lev.distance(hypes, gts) / len(gts) * 100
                if phase == "train":
                    lr_scheduler.step(phase_wer)
                curve[phase].append(phase_wer)
                phase_loss = np.mean(losses)
                print(" ", phase.upper(), "WER:", phase_wer, "Loss:", phase_loss)
                if phase_wer < best_wer[phase]:
                    best_wer[phase] = phase_wer
                    save_end2end_model(model, phase, best_wer[phase])
                if phase == "val":
                    if phase_wer < current_best_wer:
                        current_best_wer = phase_wer
                        since_wer_update = 0
                    else:
                        since_wer_update += 1
                    if since_wer_update >= END2END_STOP_LIMIT and not use_feat:
                        trained = True
                        raise KeyboardInterrupt
    except KeyboardInterrupt:
        pass
    if epoch >= END2END_N_EPOCHS:
        trained = True
    with open(os.path.join(VARS_DIR, "curve.pkl"), 'wb') as f:
        pickle.dump(curve, f)
    return best_wer, trained
def checkKNF_list(self, knf_list):
    dist = [lv.distance(self.cleaned_company_name, x) for x in knf_list]
    l1, l2 = zip(*sorted(zip(dist, knf_list), reverse=False))
    return l1, l2
import Levenshtein
import os
import requests
import requests_cache

requests_cache.install_cache('../cache')
url = 'https://adventofcode.com/' + os.path.abspath(__file__).split('/')[-2] + \
      '/day/' + __file__.split('.')[0] + '/input'
s = requests.get(url, cookies={"session": os.environ['SESSION']}).text.strip()

two = 0
three = 0
for line in s.splitlines():
    twos = False
    threes = False
    for char in range(ord('a'), ord('z') + 1):
        if line.count(chr(char)) == 2:
            twos = True
        if line.count(chr(char)) == 3:
            threes = True
    two += int(twos)
    three += int(threes)
    for otherLine in s.splitlines():
        if Levenshtein.distance(line, otherLine) == 1:
            print(line)

print(two * three)
def leven_comparison(actual, result):
    actual_formatted = re.sub(r"[^\w]", "", actual)
    result_formatted = re.sub(r"[^\w]", "", result)
    if actual_formatted == result_formatted:
        return 0  # strings are the same
    return Levenshtein.distance(actual_formatted, result_formatted)
            expression=lambda word1, word2, analyzer: abs(len(word1) - len(word2)),
            lang={"russian", "english"},
            order=5),
"length difference norm by max":
    Feature(description="""
            Absolute difference between the word lengths, divided by the maximum of the lengths
            """,
            expression=lambda word1, word2, analyzer: abs(
                len(word1) - len(word2)) / max(len(word1), len(word2)),
            lang={"russian", "english"},
            order=6),
"Levenshtein difference":
    Feature(description="""
            Edit distance between the words
            """,
            expression=lambda word1, word2, analyzer: Levenshtein.distance(
                word1, word2),
            lang={"russian", "english"},
            order=7),
"Levenshtein difference (lemmas)":
    Feature(description="""
            Edit distance between the dictionary (lemma) forms of the words
            """,
            expression=lambda word1, word2, analyzer: Levenshtein.distance(
                analyzer.parse(word1)[0].normal_form,
                analyzer.parse(word2)[0].normal_form),
            lang={"russian"},
            order=8),
"Levenshtein difference (lemmas) norm by length sum":
    Feature(description="""
            Edit distance between the dictionary forms of the words
            """,
logging.getLogger().setLevel(logging.INFO)

api_key = os.getenv("ICUBAM_API_KEY")
icubam_host = os.getenv("ICUBAM_HOST")
france_departments = set(
    list(icubam.predicu.data.load_france_departments().departmentName.unique())
)
bedcounts = icubam.predicu.data.load_icubam(
    api_key=api_key, icubam_host=icubam_host, clean=False
)
icubam_departments = set(list(bedcounts.icu_dept.unique()))
suspects = icubam_departments - france_departments
candidates = france_departments - icubam_departments

fixes = dict()
for suspect in suspects:
    best_candidate = None
    best_candidate_dist = int(1e6)
    for candidate in candidates:
        dist = levenshtein.distance(suspect, candidate)
        if dist < best_candidate_dist:
            best_candidate = candidate
            best_candidate_dist = dist
    fixes[suspect] = best_candidate
    print(suspect, '->', best_candidate, '( dist =', best_candidate_dist, ')')

with open('icubam/predicu/data/icubam_department_typo_fixes.json', 'w') as f:
    json.dump(fixes, f)
def main():
    rows = []
    rows.append(["新編日本古典文学全集・テキスト", "KuroNet翻刻", "KuroNet翻刻(前行を含む)", "巻", "ID"])
    line = "磯づたひせず とてはしたなかめりとや"
    # line = "とてはしたなかめりとや"
    # line = "く中〱なりとおほすとさま"
    # line = "いそづたひせずとてはしたなかめりとや"
    # line = "よるべなみ風のさわがす舟人も思はぬかたに磯づた"
    text = convert_line(line, "")
    print(text)
    map = {}
    map[line] = []
    count = 0
    for vol in configs:
        config = configs[vol]
        print(config)
        # koui = config["data"]
        VOL = str(vol).zfill(2)
        if VOL != "35" and False:
            continue
        print(VOL)
        path = '../../docs/iiif/kuronet/' + VOL + '.json'
        if not os.path.exists(path):
            continue
        with open(path) as f:
            df = json.load(f)
        members = df["selections"][0]["members"]

        # ---------------- Matching ----------------
        indexedObj = {}
        prev_line = ""
        for i in range(len(members)):
            member = members[i]
            prev_label = ""
            # previous line (-1)
            if i - 1 >= 0:
                prev_label = members[i - 1]["label"]
            label, text2 = convert_ocr(members[i]["label"], prev_label)
            print(VOL, i, text, text2)
            score = Levenshtein.distance(text, text2)
            score = score / max(len(text), len(text2))  # normalize
            obj = {
                "label": label,
                "main": member["label"],
                "score": score,
                "member_id": member["@id"],
                "index": i,
                "vs": "【新編】" + text + "---【OCR】" + text2,
                "vol": VOL
            }
            map[line].append(obj)
            indexedObj[count] = obj
            count += 1

    # ---------------- Aggregation ----------------
    size = len(map)
    count = 0
    # for each collation line
    for line in map:
        count += 1
        print(count, size)
        obj = map[line]
        # sort by ascending score
        score_sorted = sorted(obj, key=lambda x: x["score"])
        flg = True
        for i in range(len(score_sorted)):
            data = score_sorted[i]
            if i < 25:
                print(data)
                row = [line, data["main"], data["label"], data["vol"], data["member_id"]]
                rows.append(row)

    df = pd.DataFrame(rows)
    df.to_excel('data/check.xlsx', index=False, header=False)
barcodes = set(barcodes)

with pysam.FastxFile(args.read1) as fh, pysam.FastxFile(args.read2) as fh2:
    n = 0
    y = 0
    for record_fh, record_fh2 in zip(fh, fh2):
        barcode = record_fh.sequence[0:24]
        y += 1
        barcode_list = {}
        for b in barcodes:
            # compute the distance once, against the unmodified barcode
            dist = Levenshtein.distance(barcode, b)
            if dist <= int(args.distance):
                n += 1
                b = b + record_fh.sequence[24:]
                barcode_list[b] = dist
            else:
                pass
        if bool(barcode_list):
            b = min(barcode_list, key=barcode_list.get)
            res = True
            test_value = list(barcode_list.values())[0]
            for ele in barcode_list:
# print "-->", median, if median == last_median: break return median while True: print target = raw_input('Enter a target (ground-truth) word : ').strip() predictions = raw_input('Enter some predicted words : ').strip().split() print # get distance for each predicted word, as well as their average dist_tot = 0 for word in predictions: dist = lev.distance(target, word) print "%4i = edit_distance(%s, %s)" % (dist, target, word) dist_tot += dist dist_avg_pred = 1.*dist_tot/len(predictions) print print "Average edit_distance between each predicted word and the target word", print [target], "is:", dist_avg_pred # get the distance for the median word (to compare) weights= len(predictions) * [1,] median = get_median(predictions, weights) print "\nMedian string of", predictions, "is", [median] dist_median = 1.*lev.distance(target, median) print "Edit_distance from the median string", [median], print " to the target word ", [target], "is:", dist_median print "\n-----"
def silhouette(Dicofasta, Dicocentroid, Dicoresult, DicoNeighbour, len_max_CDR3, len_max_J, dico_vjunc):
    summe = 0
    for cluster in Dicoresult.keys():
        for seq in Dicoresult[cluster]:
            dist_intra = 0
            dist_neighb = {}
            for seq_same_clust in Dicoresult[cluster]:
                if seq_same_clust != seq:
                    V_component = 0
                    # same V, without considering the allele
                    if Dicofasta[seq][0].split("*")[0] != Dicofasta[seq_same_clust][0].split("*")[0]:
                        V_component += 1
                    # normalize the distance by the longest seq in the whole repertoire
                    CDR_component = float(
                        Levenshtein.distance(
                            Dicofasta[seq][2],
                            Dicofasta[seq_same_clust][2])) / len_max_CDR3
                    J_component = (100 - get_similarity_score(
                        Dicofasta[seq][3], Dicofasta[seq_same_clust][3],
                        len(Dicofasta[seq][3]),
                        len(Dicofasta[seq_same_clust][3]))) / float(len_max_J)
                    # assumed intent: average of the three components
                    # (the original divided only J_component by 3.0)
                    dist_intra += (CDR_component + V_component + J_component) / 3.0
            if len(Dicoresult[cluster]) != 1:
                ai = float(dist_intra) / (len(Dicoresult[cluster]) - 1)
            else:
                ai = 0
            for seq_neighb in Dicoresult[DicoNeighbour[cluster]]:
                V_component_b = 0
                CDR_component_b = 0
                J_component_b = 0
                # same V, without considering the allele
                if Dicofasta[seq][0].split("*")[0] != Dicofasta[seq_neighb][0].split("*")[0]:
                    V_component_b += 1
                CDR_component_b = Levenshtein.distance(
                    Dicofasta[seq][1], Dicofasta[seq_neighb][1]) / float(len_max_CDR3)
                J_component_b = (100 - get_similarity_score(
                    Dicofasta[seq][3], Dicofasta[seq_neighb][3],
                    len(Dicofasta[seq][3]),
                    len(Dicofasta[seq_neighb][3]))) / float(len_max_J)
                dist_neighb[seq_neighb] = (CDR_component_b + V_component_b + J_component_b) / 3.0
            bi = min(dist_neighb.values())
            if bi < ai:
                to_move = (list(dist_neighb.keys())[list(dist_neighb.values()).index(bi)])
                Dicoresult[DicoNeighbour[cluster]].remove(to_move)
                if len(Dicoresult[DicoNeighbour[cluster]]) == 0:
                    del Dicoresult[DicoNeighbour[cluster]]
                Dicoresult[cluster].append(to_move)
                Dicocentroid = CalculateMedoid(dico_vjunc, Dicoresult)
                DicoNeighbour = Creat_dico_neighbour(Dicocentroid)
                # silhouette(Dicofasta, Dicocentroid, Dicoresult, DicoNeighbour, len_max_CDR3, len_max_J, dico_vjunc)
    return (Dicoresult, DicoNeighbour, Dicocentroid)
outf2 = iotools.open_file(
    "corrected_reads.dir/" + args.outname + "_corrected.fastq.2.gz", "w")
log = iotools.open_file("corrected_reads.dir/" + args.outname + ".log", "w")

with pysam.FastxFile(args.read1) as fh, pysam.FastxFile(args.read2) as fh2:
    n = 0
    y = 0
    for record_fh, record_fh2 in zip(fh, fh2):
        barcode = record_fh.sequence[0:24]
        y += 1
        for b in barcodes:
            if Levenshtein.distance(barcode, b) <= int(args.distance):
                n += 1
                b = b + record_fh.sequence[24:]
                outf.write("@%s\n%s\n+\n%s\n" %
                           (record_fh.name, b[::2], record_fh.quality[::2]))
                outf2.write("@%s\n%s\n+\n%s\n" %
                            (record_fh2.name, record_fh2.sequence, record_fh2.quality))
                break
            else:
                pass

log.write("The number of total reads is: %s\n" % (y))
log.write("The number of total recovered reads is: %s\n" % (n))