def getSuccessors(self, statement, i, goalCoHash = None):
    """Apply the absorption law at node ``i``.

    When node ``i`` is a conjunction/disjunction and one of its children is
    the dual connective, the node is replaced by the other operand ``p`` if
    ``p`` equals one of the dual child's operands (``p & (q v p)`` -> ``p``).
    Children of node ``k`` are addressed as ``2k+1`` and ``2k+2``.

    Returns a single successor, a list of successors when both operand
    orders apply, or None when the rule does not apply. ``goalCoHash`` is
    accepted for interface compatibility and is not used here.
    """
    nodeType = statement.type(i)
    if nodeType != "conjunction" and nodeType != "disjunction":
        return None
    dual = "disjunction" if nodeType == "conjunction" else "conjunction"

    # (index of dual-typed child, index of p, indices of the dual child's operands)
    layouts = (
        (i*2+2, i*2+1, i*4+5, i*4+6),  # p & (q v r)
        (i*2+1, i*2+2, i*4+3, i*4+4),  # (q v r) & p
    )

    found = []
    for dualIdx, pIdx, qIdx, rIdx in layouts:
        if statement.type(dualIdx) != dual:
            continue
        p = statement.childTree(pIdx)
        q = statement.childTree(qIdx)
        r = statement.childTree(rIdx)
        if p == q or p == r:
            absorbed = statement.graft(i, p)
            absorbed.action = self.name
            absorbed.cost = self.cost + distance(str(statement), str(absorbed))
            found.append(absorbed)

    if not found:
        return None
    if len(found) == 1:
        return found[0]
    return found
def getSuccessors(self, statement, i, goalCoHash = None):
    """Apply the domination law at node ``i``.

    A conjunction with a ``false_constant`` operand collapses to
    ``false_constant``; a disjunction with a ``true_constant`` operand
    collapses to ``true_constant``. Returns the rewritten statement, or None
    when the rule does not apply. ``goalCoHash`` is unused.
    """
    dominators = {
        "conjunction": "false_constant",
        "disjunction": "true_constant",
    }
    constant = dominators.get(statement.type(i))
    if constant is None:
        return None
    if statement.type(i*2+1) != constant and statement.type(i*2+2) != constant:
        return None

    # childTree(0) is presumably a copy of the whole tree -- confirm in Statement.
    collapsed = statement.childTree(0)
    collapsed.prune(i)
    collapsed.insertProp(i, constant)
    collapsed.action = self.name
    collapsed.cost = self.cost + distance(str(statement), str(collapsed))
    return collapsed
def get_min_score(tokens, img_name):
    """Return the smallest edit distance between ``img_name`` and any token.

    ``tokens`` must be a non-empty indexable sequence; an empty sequence
    raises IndexError on the first lookup.
    """
    best = distance(tokens[0], img_name)
    for position in range(1, len(tokens)):
        best = min(best, distance(tokens[position], img_name))
    return best
def search(start,goal,rules,verbose = False): goalStr = str(goal) # l = len(str(start))+len(str(goal)) nodesExpanded = 0 shortcuts = 0 node = Node(start, None) node.cost = distance(str(node.state), goalStr) frontier = PriorityQueue() frontier.push(node,node.cost) explored = set() while not frontier.isEmpty(): if nodesExpanded > 2000: raise TimeOutException() node = frontier.pop() if nodesExpanded%10==0: print nodesExpanded nodesExpanded += 1 if node.state == goal: # print "expanded: ", nodesExpanded, " shortcuts: ", shortcuts print "expanded: ", nodesExpanded, " shortcuts: ", shortcuts return Derivation(start,goal,node.traceback(),rules) explored.add(node.state) for child in node.successors(rules,goal): h = distance(str(child.state), goalStr) if child.state not in explored and frontier.getCheapestCost(child) == -1: frontier.push(child, child.cost + h) if verbose: print child.cost, child.state, h elif frontier.getCheapestCost(child) > child.cost: shortcuts += 1 frontier.push(child, child.cost + h) print "NOT LOGICALLY EQUIVALENT" return False
def mate_pop(top, population):
    """Breed ``top`` with every member of ``population`` and return the result.

    Each member's string (element 1 of the member tuple) is crossed with
    ``top`` and mutated; offspring are scored against ``TARGET``. The returned
    list holds ``(distance, string)`` tuples in ascending order and always
    starts from ``top`` itself.
    """
    scored = [(distance(top, TARGET), top)]
    for member in population:
        child = mutate(crossover(member[1], top))
        scored.append((distance(child, TARGET), child))
    # NOTE(review): the last-appended entry is dropped *before* sorting, so
    # the discarded individual is not necessarily the worst one -- confirm
    # this is intended.
    del scored[-1]
    scored.sort()
    return scored
def return_operon_string_distance(operon_string, gene_string):
    """Return the length-adjusted edit distance between operon and gene strings.

    The gene string is compared in both orientations (as given and reversed);
    the length difference ``len(operon) - len(gene)`` is subtracted from each
    distance and the smaller result is returned.
    """
    length_gap = len(operon_string) - len(gene_string)
    orientations = (gene_string, gene_string[::-1])
    return min(
        distance(operon_string, candidate) - length_gap
        for candidate in orientations
    )
def getSuccessors(self, statement, i, goalCoHash = None):
    """Apply commutativity at node ``i`` by swapping its two operands.

    Only conjunctions and disjunctions are rewritten; otherwise None is
    returned. The step is ten times more expensive unless the statement's
    ``cohash()`` already equals ``goalCoHash``.
    """
    # Idea from the original author: discount commutativity when close to
    # goalCoHash, using the older order-insensitive equivalence hashing.
    if statement.type(i) not in ("conjunction", "disjunction"):
        return None

    leftIdx, rightIdx = i*2+1, i*2+2
    left = statement[leftIdx]
    right = statement[rightIdx]

    swapped = statement.graft(leftIdx, right)
    swapped.graftInPlace(rightIdx, left)
    swapped.action = self.name

    weight = 1 if statement.cohash() == goalCoHash else 10
    swapped.cost = self.cost + weight * distance(str(statement), str(swapped))
    return swapped
def match(self, equipe):
    """Tell whether ``equipe`` plausibly belongs to this challenge participation.

    Returns False when a category is set and rejects the team. Otherwise
    every (team member, challenge member) pair is counted as a hit when both
    hold a licence with the same number, or when both surname and first name
    are within edit distance 3 (case-insensitive). The team matches when the
    hit count reaches half the team size.
    """
    if self.categorie and not self.categorie.valide(equipe):
        return False

    equipiers_challenge = Equipier.objects.filter(equipe__challenges__participation=self)
    equipiers = equipe.equipier_set.all()

    def same_person(e, e2):
        if e.justificatif == 'licence' and e2.justificatif == 'licence' and e.num_licence == e2.num_licence:
            return True
        return distance(e.nom.lower(), e2.nom.lower()) < 3 and distance(e.prenom.lower(), e2.prenom.lower()) < 3

    # A member matching several challenge members is counted once per pair.
    hits = sum(
        1
        for e in equipiers
        for e2 in equipiers_challenge
        if same_person(e, e2)
    )
    return hits >= len(equipiers) / 2
def getSuccessors(self, statement, i, goalCoHash = None):
    """Apply De Morgan's law at node ``i``.

    * ``~(p o q)`` becomes ``~p o' ~q`` (``o'`` is the dual connective).
    * ``p o q`` becomes ``~(~p o' ~q)`` when ``self.dangerous`` is set.
    * ``~p o ~q`` becomes ``~(p o' q)`` when ``self.dangerous`` is not set.

    Returns the rewritten statement, or None when no case applies.
    ``goalCoHash`` is unused.
    """
    connectives = ("conjunction", "disjunction")
    nodeType = statement.type(i)

    if nodeType == "negation":
        innerType = statement.type(i*2+1)
        if innerType not in connectives:
            return None
        dual = "disjunction" if innerType == "conjunction" else "conjunction"
        # Push the negation inwards onto both operands of the inner connective.
        negLeft = statement.negatedChildTree(i*4+3)
        negRight = statement.negatedChildTree(i*4+4)
        replacement = Statement(dict(), statement.propMap)
        replacement.insertProp(0, dual)
        replacement.graftInPlace(1, negLeft)
        replacement.graftInPlace(2, negRight)
    elif nodeType in connectives:
        dual = "disjunction" if nodeType == "conjunction" else "conjunction"
        if self.dangerous:
            # Always applicable: negate both operands and wrap in a negation.
            left = statement.negatedChildTree(i*2+1)
            right = statement.negatedChildTree(i*2+2)
        elif statement.type(i*2+1) == "negation" and statement.type(i*2+2) == "negation":
            # Both operands are negations: pull the negation outwards. A
            # negation keeps its operand in its left child slot.
            left = statement.childTree(i*4+3)
            right = statement.childTree(i*4+5)
        else:
            return None
        replacement = Statement(dict(), statement.propMap)
        replacement.insertProp(0, "negation")
        replacement.insertProp(1, dual)
        replacement.graftInPlace(3, left)
        replacement.graftInPlace(4, right)
    else:
        return None

    successor = statement.graft(i, replacement)
    successor.action = self.name
    successor.cost = self.cost + distance(str(statement), str(successor))
    return successor
def getSuccessors(self, statement, i, goalCoHash = None):
    """Apply distributivity at node ``i``.

    ``p & (q v r)`` becomes ``(p & q) v (p & r)`` (and the dual form); the
    same expansion is produced when the dual-typed operand is on the left.

    Returns a single successor, a list when both operands are of the dual
    type, or None when the rule does not apply. ``goalCoHash`` is unused.
    """
    nodeType = statement.type(i)
    if nodeType not in ("conjunction", "disjunction"):
        return None
    dual = "disjunction" if nodeType == "conjunction" else "conjunction"

    # (index of dual-typed child, index of p, indices of q and r)
    layouts = (
        (i*2+2, i*2+1, i*4+5, i*4+6),  # p & (q v r)
        (i*2+1, i*2+2, i*4+3, i*4+4),  # (q v r) & p
    )
    targetSlots = (i*4+3, i*4+4, i*4+5, i*4+6)

    expansions = []
    for dualIdx, pIdx, qIdx, rIdx in layouts:
        if statement.type(dualIdx) != dual:
            continue
        # p is fetched twice because it appears in both halves of the result.
        p = statement.childTree(pIdx)
        pAgain = statement.childTree(pIdx)
        q = statement.childTree(qIdx)
        r = statement.childTree(rIdx)

        expanded = statement.childTree(0)
        expanded.prune(i)
        expanded.insertProp(i, dual)            # _ v _
        expanded.insertProp(i*2+1, nodeType)    # (_ & _) v _
        expanded.insertProp(i*2+2, nodeType)    # (_ & _) v (_ & _)
        for slot, subtree in zip(targetSlots, (p, q, pAgain, r)):
            expanded.graftInPlace(slot, subtree)  # (p & q) v (p & r)
        expanded.action = self.name
        expanded.cost = self.cost + distance(str(statement), str(expanded))
        expansions.append(expanded)

    if not expansions:
        return None
    if len(expansions) == 1:
        return expansions[0]
    return expansions
def check_distances(combined_hcv, report_file=None):
    """Report samples whose reported genotype is not the closest reference.

    ``combined_hcv`` is a seekable FASTA handle containing ``Ref.*`` and
    ``Sample.*`` records; the genotype is the text after the last ``-`` in a
    header. Gap characters (``-``) are removed before comparison. One line is
    printed to ``report_file`` for every sample whose best distance differs
    from the best distance among references of its reported genotype.
    """
    def ungapped(prefix):
        return ((header, sequence.replace('-', ''))
                for header, sequence in iterate_fasta(combined_hcv)
                if header.startswith(prefix))

    references = list(ungapped('Ref.'))
    combined_hcv.seek(0)  # second pass over the same handle for the samples

    for header, sequence in ungapped('Sample.'):
        reported_genotype = header.split('-')[-1]
        reported = None  # (distance, header, size) of closest same-genotype ref
        best = None      # (distance, header, size) of closest ref overall
        for ref_header, ref_seq in references:
            d = distance(sequence, ref_seq)
            entry = (d, ref_header, len(ref_seq))
            same_genotype = ref_header.split('-')[-1] == reported_genotype
            if same_genotype and (reported is None or d < reported[0]):
                reported = entry
            if best is None or d < best[0]:
                best = entry

        reported_distance, reported_ref, reported_size = reported or (None, None, 0)
        min_distance, best_ref, best_size = best or (None, None, 0)
        if min_distance != reported_distance:
            best_genotype = best_ref.split('-')[-1]
            print(f'Reported {reported_genotype}, but {best_genotype} is '
                  f'closer: {header}(0/{len(sequence)}), '
                  f'{reported_ref}({reported_distance}/{reported_size}), '
                  f'{best_ref}({min_distance}/{best_size}).',
                  file=report_file)
def extract_option(text):
    """
    Return the Option referenced by ``text`` (in various fuzzy ways), or
    raise a ValueError if none could be found.

    An option whose letter or caption equals ``text`` (case-insensitively) is
    returned immediately. Otherwise the option whose caption has the smallest
    edit distance to ``text`` is returned, provided that distance does not
    exceed ``settings.MAX_MATCH_DISTANCE``. Raises TypeError when ``text`` is
    not a string.
    """
    if not isinstance(text, basestring):
        raise TypeError("Not a basestring: %r" % text)

    needle = unicode(text).lower()
    candidates = []

    for option in Option.objects.all():
        letter = option.letter.lower()
        caption = option.caption.lower()

        # An exact hit on either field wins outright.
        if needle in (letter, caption):
            return option

        gap = distance(needle, caption)
        if gap is not None and gap <= settings.MAX_MATCH_DISTANCE:
            candidates.append((option, gap))

    if candidates:
        # min() keeps the first of equally-close candidates, like a stable sort.
        return min(candidates, key=lambda entry: entry[1])[0]

    raise ValueError("No Option could be found in: %s" % text)
def find_similar_names(search_name, base, default_distance):
    """Find records in ``base`` whose name parts are close to ``search_name``.

    ``base`` maps an id to a record; element 1 of the record is the name (a
    sequence of parts) and element 5 is the district. Parts are compared
    pairwise; a one-character search part is compared against the first
    character of the record part only (initials).

    Returns ``[[mpid, name, district]]`` for the first exact match (same
    number of parts, all distances zero); otherwise the list of all records
    whose compared parts are each closer than ``default_distance``.
    """
    similar = []
    for mpid in base.keys():
        record = base[mpid]
        name, district = record[1], record[5]

        gaps = []
        for wanted, actual in zip(search_name, name):
            if len(wanted) == 1:
                actual = actual[0:1]
            gaps.append(distance(wanted, actual))

        if len(search_name) == len(name) and sum(gaps) == 0:
            return [[mpid, name, district]]
        if all(gap < default_distance for gap in gaps):
            similar.append([mpid, name, district])
    return similar
def anomalies(self, request):
    """Admin view listing team members of a course that look like duplicates.

    The course is identified by the ``course_uid`` cookie and must be
    accredited to the requesting user. Members whose ``numero`` exceeds their
    team's ``nombre`` are ignored. Two members are considered duplicates when
    the edit distance between their lower-cased "nom prenom" strings is below
    3. Each group starts with the reference member followed by its matches.
    """
    request.current_app = self.name
    course = Course.objects.get(uid=request.COOKIES['course_uid'],
                                accreditations__user=request.user)
    equipiers = list(Equipier.objects.filter(equipe__course=course).select_related('equipe__categorie'))

    def full_name(member):
        return (member.nom + ' ' + member.prenom).lower()

    doublons = []
    for idx, first in enumerate(equipiers):
        if first.numero > first.equipe.nombre:
            continue
        lookalikes = [
            other for other in equipiers[idx + 1:]
            if not other.numero > other.equipe.nombre
            and distance(full_name(first), full_name(other)) < 3
        ]
        if lookalikes:
            doublons.append([first] + lookalikes)

    print(doublons)
    context = dict(self.each_context(request), doublons=doublons, course=course, )
    return TemplateResponse(request, 'admin/anomalies.html', context)
def fix_garbage_sugar(pairs):
    """Relabel rows that belong to a garbled "sugars" label.

    ``pairs`` is a sequence of ``(text, value, group_id)`` tuples. The first
    row whose text contains a 5-character window within edit distance 2 of
    ``Keywords.label.sugars`` selects a group id; every row with that group id
    then has its text replaced by ``Keywords.label.sugars``. When no row
    matches, the rows are returned unchanged (as new tuples).

    The original implementation reused ``i`` both as the "no match" sentinel
    and as the window loop index, so without a match it relabelled rows whose
    group id happened to equal the last window index.
    """
    target_id = None
    found = False
    for pair in pairs:
        text = pair[0]
        # NOTE(review): the window count assumes 6 characters while the slice
        # below is 5 characters wide -- kept as in the original.
        windows = max(len(text) - 6 + 1, 1)
        for start in range(windows):
            if distance(Keywords.label.sugars, text[start:5 + start]) <= 2:
                target_id = pair[2]
                found = True
                break
        if found:
            break

    betterPairs = []
    for pair in pairs:
        name = pair[0]
        if found and pair[2] == target_id:
            name = Keywords.label.sugars
        betterPairs.append((name, pair[1], pair[2]))
    return betterPairs
def generate_aliases(table, ref_list, match_list, dist_limit=3):
    """Record alias candidates for every reference entry, then write them out.

    Each entry of ``ref_list`` and ``match_list`` is a mapping with ``name``
    and ``fp`` keys. References missing from ``table`` are inserted as their
    own canonical name. Every match whose ``fp`` is closer than ``dist_limit``
    to a reference's ``fp`` is inserted (once per name) with that reference as
    ``candidate``. Progress is printed every 100000 comparisons, and
    ``write_aliases(table)`` is called at the end.
    """
    done = 0.0
    total = float(max(1, len(ref_list) * len(match_list)))

    for ref in ref_list:
        if not table.find_one(name=ref["name"]):
            table.insert({"name": ref["name"], "fp": ref["fp"], "canonical": ref["name"]})

        for match in match_list:
            dist = distance(match["fp"], ref["fp"])
            done += 1.0
            if done and done % 100000 == 0:
                percent = int((done / total) * 100)
                print("%s matching: %s%%" % (table.table.name, percent))

            if dist >= dist_limit:
                continue
            if table.find_one(name=match["name"]):
                continue
            table.insert(
                {
                    "name": match["name"],
                    "fp": match["fp"],
                    "candidate": ref["name"],
                    "distance": dist,
                    "canonical": match["name"],
                }
            )

    write_aliases(table)
def search_by_similar_name(self, genus, species):
    """
    Search for Species with a similarly spelled name as the given name.
    This method can help correct spelling mistakes in species names.

    Candidates share the first and last two letters of both genus and
    species. The closest candidate is used when its edit distance to
    "genus species" is below 3; otherwise a new Species is created. The
    result is stored in ``species_by_fullname`` under the full name and
    returned.

    Fixes over the original: the threshold is applied to the best distance
    (not the distance of the last candidate inspected), the new record is
    created from ``genus``/``species`` (the original referenced an undefined
    ``i``), and the resolved object is returned.
    """
    matches = self.filter(
        genus__startswith=genus[:2],
        genus__endswith=genus[-2:],
        species__startswith=species[:2],
        species__endswith=species[-2:]
    )
    complete_name = u" ".join((genus, species))

    min_match = 10
    min_match_species = None
    for match in matches:
        d = distance(complete_name, unicode(match))
        if d < min_match:
            min_match = d
            min_match_species = match

    if min_match_species is not None and min_match < 3:
        resolved = min_match_species
    else:
        resolved = Species.objects.create(genus=genus, species=species)

    species_by_fullname[complete_name] = resolved
    return resolved
def clustering(self, elems):
    """
    Clusterize the input elements.

    Input: list of words (e.g. list of URLs). It MUST be sorted!

    Process: consecutive elements stay in the same cluster while the edit
    distance to the previous element is at most ``DISTANCE``; otherwise a new
    cluster is opened. Cluster IDs are consecutive ints starting at 0.

    Side effects: stores ``largest`` (from ``self.get_largest_cluster()``) and
    ``number_of_clusters`` in ``self.clusters['clusters']``.
    """
    groups = {}
    current = 0
    for position, item in enumerate(elems):
        # Compare against the most recent member of the open cluster.
        if position > 0 and distance(groups[current][-1], item) > DISTANCE:
            current += 1
        groups.setdefault(current, []).append(item)

    # NOTE(review): the local mapping is not stored on self here (the
    # assignment to self.clusters['clusters'] was commented out upstream).
    self.clusters['clusters']['largest'] = self.get_largest_cluster()
    self.clusters['clusters']['number_of_clusters'] = current + 1
def getTemplateNoDiac(word):
    """Return the template(s) (wazn) matching an Arabic word, ignoring diacritics.

    Diacritics are removed from the word and from every known template; the
    template additionally goes through ``deleteRoot``. Only templates of the
    same length as the word that also match it as a regular expression are
    considered. The template(s) with the smallest edit distance are returned;
    several equally-close templates are joined with ``+``. An empty string is
    returned when nothing matches.
    """
    template = u""
    best = 1000
    bare_word = unicode(deleteDiacritics(word))

    for wazn in wazns:
        pattern = unicode(deleteRoot(deleteDiacritics(wazn)))
        if len(pattern) != len(bare_word):
            continue
        gap = distance(bare_word, pattern)
        # Worse than the current best, or not a regex match: skip.
        if gap > best or not re.match(pattern, bare_word):
            continue
        if gap < best:
            best = gap
            template = wazn
        else:
            template = template + '+' + wazn
    return template
def optional_check(): "Optionally check for sentences that we failed to match." print 'Not found:' not_found = {s for h,s in negation_sentence_hashes.items() if not h in found} print '\n'.join(not_found) from Levenshtein import distance print '' print 'Computing Levenshtein distances to find candidates we could have missed.' found_missing = False for split in ['train']: for key in old_file[split]: for description in old_file[split][key]['descriptions']: for sentence in not_found: h1 = sentence_hash(sentence) h2 = sentence_hash(description) if distance(h1, h2) < 15: found_missing = True print 'POSSIBLE MATCH:' print sentence print description print '----------------------------------' if not found_missing: print 'Matched all we could possibly match.'
def main():
    """Entry point: runs ``try2()`` and returns.

    NOTE(review): everything after the early ``return`` is unreachable. It is
    an evolutionary loop that crosses the two best individuals for ``CYCLES``
    rounds and refills the population with ``givemepop``. Confirm whether the
    early return is a leftover debugging shortcut.
    """
    try2()
    return
    # --- unreachable from here on ---
    population = []
    population = givemepop(population)
    print population
    for i in range(0, CYCLES):
        # Cross the two best individuals (population is kept sorted by givemepop).
        mated = crossover(population[0][1], population[1][1])
        d = distance(mated, TARGET)
        new_pop = []
        new_pop.append((d, mated))
        new_pop.append(population[0])
        new_pop.append(population[1])
        # givemepop tops the population up with fresh random strings and sorts it.
        new_pop = givemepop(new_pop)
        print new_pop
        population = new_pop
def admin_season(selected_season):
    """Admin page for one season.

    Aborts with 404 when the season is unknown. A form ``action`` of
    ``recalculate_points`` recomputes and stores the points and returns an
    empty JSON response. Otherwise the admin page is rendered together with
    all driver-name pairs whose edit distance is at most 4, closest first.
    """
    seasons = config.get_all_seasons()
    if selected_season not in seasons:
        abort(404, "Season data not found")
    season_data = config.get_season_data(selected_season)

    if request.form.get('action') == 'recalculate_points':
        season_data.calc_and_store_points()
        return jsonify()

    # Likely misspellings are computed on the fly for display.
    results_table = season_data.get_results_for_class()
    drivers = sorted(row["driver"] for row in results_table.table)
    similar_drivers = []
    for idx, first in enumerate(drivers):
        for second in drivers[idx + 1:]:
            dist = distance(first, second)
            if dist <= 4:
                similar_drivers.append({"name1": first, "name2": second, "distance": dist})
    similar_drivers.sort(key=itemgetter("distance"))

    driver_name_corrections = config.get_driver_name_corrections(selected_season)
    return render_template(
        "admin.html",
        seasons=seasons,
        selected_season=selected_season,
        season_data=season_data,
        similar_drivers=similar_drivers,
        driver_name_corrections=driver_name_corrections,
    )
def get_departures_by_station(self, station): """ Get list of Departures for one station """ # TODO 1. Error handling # TODO 2. more error handling # TODO 3. ultimative error handling station = station.encode('UTF-8') html = urlopen(defaults.departures_by_station % quote_plus(station)).read() li = BeautifulSoup(html).ul.findAll('li') if li[0].a: # calculate levenshtein distance of results st = map(lambda x: (distance(station, x.a.text.encode('UTF-8')), x.a.text.encode('UTF-8'), x.a['href']), li) # take result with lowest levenshtein distance s = min(st) lnk = s[2] if len(st) > 1: print "Multiple results found, using best match:", s[1] html = urlopen(defaults.qando + lnk).read() dep = self.parse_departures_by_station(html) return dep
def calc_similarity(word1, word2):
    """Return a similarity score for two words, 100 meaning identical.

    The score is ``100 - 100 * distance / max(len(word1), len(word2))``.

    Two empty strings are identical and score 100 (the original raised
    ZeroDivisionError). The division is done in floating point so the result
    is not truncated under integer division.
    """
    longest = max(len(word1), len(word2))
    if longest == 0:
        return 100.0
    return 100 - 100.0 * distance(word1, word2) / longest
def unshred(self, output): distances = {} totheright = {} for key in self.strips: min_k = None min_d = None for key2 in self.strips: if key != key2: d = distance(self.strips[key]['right'], self.strips[key2]['left']) if min_k == None: min_k = key2 min_d = d else: if d < min_d: min_k = key2 min_d = d print '... strip %d closest match is %d (%d)' % (key, min_k, min_d) distances[key] = min_d totheright[key] = min_k right_most = max(distances, key=distances.get) print 'We think that strip %d is the is the right-most strip!' % right_most del(totheright[right_most]) pprint(totheright) self.ordered = [right_most] while totheright: for key in totheright.keys(): if totheright[key] == self.ordered[0]: self.ordered.insert(0, key) del(totheright[key]) print 'Here\'s our order:', self.ordered self._save(output)
def test_convert():
    """Check the nucleotide edit distance between source and converted sequence.

    There are 4 amino acid changes, but because codons are chosen randomly,
    experimental runs showed anywhere between 8 and 12 nucleotide changes,
    inclusive.
    """
    source = str(np.src_nt.seq)
    converted = str(np.des_nt.seq)
    changes = distance(source, converted)
    assert changes <= 12
    assert changes >= 8
def matching(a, b):
    """Return True when two strings are considered the same name.

    Checks, in order: case-insensitive substring containment in either
    direction, a Jaccard score of at least 0.3, and an edit distance below 3.
    """
    lowered_a, lowered_b = a.lower(), b.lower()
    if lowered_a in lowered_b or lowered_b in lowered_a:
        return True
    return get_jaccard(a, b) >= .3 or distance(a, b) < 3
def getSuccessors(self, statement, i, goalCoHash = None):
    """Apply idempotence at node ``i``: ``p & p`` or ``p v p`` becomes ``p``.

    Returns the rewritten statement, or None when node ``i`` is not a
    conjunction/disjunction or its operands differ. ``goalCoHash`` is unused.
    """
    if statement.type(i) not in ("conjunction", "disjunction"):
        return None
    operand = statement[i * 2 + 1]
    if operand != statement[i * 2 + 2]:
        return None
    reduced = statement.graft(i, operand)
    reduced.action = self.name
    reduced.cost = self.cost + distance(str(statement), str(reduced))
    return reduced
def givemepop(initialpop):
    """Top ``initialpop`` up to ``POP_SIZE`` random individuals and sort it.

    New individuals are random strings of ``len(TARGET)`` characters stored as
    ``(distance to TARGET, string)`` tuples. The list is extended and sorted
    in place, and the same list object is returned.
    """
    missing = POP_SIZE - len(initialpop)
    for _ in range(missing):
        candidate = givemestring(len(TARGET))
        initialpop.append((distance(candidate, TARGET), candidate))
    initialpop.sort()
    return initialpop
def check_perms(names1, names2):
    """Return True when dropping one name from ``names1`` makes it match ``names2``.

    For each name in ``names1`` all occurrences of that name are removed; the
    remaining names and ``names2`` are joined with spaces and compared. A
    match is an edit distance of at most ``thres``.
    """
    return any(
        distance(' '.join(n for n in names1 if n != dropped), ' '.join(names2)) <= thres
        for dropped in names1
    )
def safe_distance(a, b):
    """Edit distance between two name parts with special cases for short input.

    Text matched by ``RX_DOT`` is removed first. Then:

    * equal strings -> 0
    * an empty string on either side -> longer length + 1
    * a single letter versus a word longer than 3 with the same first
      letter (an initial) -> 0
    * any side shorter than 3 characters -> longer length + 1
    * otherwise the plain edit distance.
    """
    a = RX_DOT.sub('', a)
    b = RX_DOT.sub('', b)
    if a == b:
        return 0

    penalty = max(len(a), len(b)) + 1
    if not a or not b:
        return penalty

    if a[0] == b[0]:
        a_is_initial = len(a) == 1 and len(b) > 3
        b_is_initial = len(b) == 1 and len(a) > 3
        if a_is_initial or b_is_initial:
            return 0

    if min(len(a), len(b)) < 3:
        return penalty
    return distance(a, b)
def test_levenshtein(self):
    """Pin the behaviour of the Levenshtein helpers on tiny inputs.

    ``distance`` counts additions, deletions and updates; ``ratio``,
    ``setratio`` and ``seqratio`` are in [0, 1]. ``setratio`` compares two
    sets by best fit, so order does not matter.
    """
    cases = (
        (distance, ('a', 'ab'), 1),
        (ratio, ('a', 'b'), 0),
        (ratio, ('a', 'a'), 1),
        (setratio, (['a', 'b'], ['b', 'a']), 1.0),
        (setratio, (['c', 'd'], ['b', 'a']), 0),
        (seqratio, (['a', 'b'], ['b', 'a']), 0.5),
        (seqratio, (['a', 'b'], ['a', 'b']), 1.0),
        (seqratio, (['a'], ['a', 'b']), 2 / 3),
    )
    for func, args, expected in cases:
        eq(func(*args), expected)
def find_nearest(title, title_dict):
    """Look ``title`` up in ``title_dict``, falling back to the closest key.

    Returns None for a None title and the stored value for an exact key hit.
    Otherwise the value of the key with the smallest normalised edit distance
    (distance divided by the longer length) is returned, provided that ratio
    is below ``EDIT_DISTANCE_RATIO_THRESHOLD``; None when no key qualifies.
    """
    if title is None:
        return None
    if title in title_dict:
        return title_dict[title]

    best_ratio = EDIT_DISTANCE_RATIO_THRESHOLD
    nearest = None
    for candidate, value in title_dict.items():
        ratio = distance(title, candidate) / max(len(title), len(candidate))
        if ratio < best_ratio:
            best_ratio, nearest = ratio, value
    return nearest
def compare_metadata(prev, potential, mood):
    """Return weighted per-field differences between two metadata records.

    ``prev['metadata']`` and ``potential['metadata']`` are compared field by
    field. Field 5 is treated as a date: values that are not exactly 4
    characters long count as 0 and the absolute difference is used. Other
    fields use the edit distance normalised by the combined length, or 100000
    when either side is empty. Each difference is multiplied by the matching
    weight in ``mood['metadata']``.

    The fallback handler now catches ``Exception`` instead of everything, so
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    diffs = []
    for idx, (old, new) in enumerate(zip(prev['metadata'], potential['metadata'])):
        if idx == 5:  # dates
            olddate = int(old) if len(old) == 4 else 0
            newdate = int(new) if len(new) == 4 else 0
            diffs.append(np.abs(newdate - olddate))
            continue
        try:
            if old != "" and new != "":
                diffs.append(distance(unicode(old), unicode(new)) / float(len(old) + len(new) + 1))
            else:
                diffs.append(100000)
        except Exception:
            # Fall back to an un-normalised comparison of the raw values.
            diffs.append(distance((old), str(new)))
    return [diff * weight for diff, weight in zip(diffs, mood['metadata'])]
def test_khash(xs, D2, attempts=1e6):
    """Spot-check that ``D2`` contains every close pair drawn from ``xs``.

    Draws ``attempts`` random index pairs. For pairs with edit distance 1 or
    2 the sorted pair is looked up in ``D2``: hits are counted, misses are
    reported on stdout. Returns the number of hits.
    """
    size = len(xs)
    confirmed = 0
    for _ in range(int(attempts)):
        i, j = np.random.randint(size, size=2)
        a, b = xs[i], xs[j]
        if not 0 < distance(a, b) < 3:
            continue
        if tuple(sorted((a, b))) in D2:
            confirmed += 1
        else:
            print('fuckyou')
    return confirmed
def one_char_typosquatting(s_a='', s_b=''):
    """
    Detect one-character typosquatting between two strings.

    Types of one char typosquatting:
        inplace one char:  paypal -> paypel, paypal -> paypai, paypal -> qaypal
        inflate one char:  paypal -> paypal2, paypal -> payypal, paypal -> ppaypal
        deflate one char:  paypal -> payal, paypal -> papal
        switched neighbour chars:  paypal -> papyal, paypal -> payapl

    Returns False for empty or identical input and when both strings are
    shorter than 4 characters.
    """
    # NOTE(review): the length guard rejects only when *both* strings are
    # shorter than 4 -- confirm that "at least 4" was not meant for each.
    if not s_a or not s_b or s_a == s_b:
        return False
    if len(s_a) < 4 and len(s_b) < 4:
        return False

    # Edit distance 1 covers inplace, inflate and deflate (accents ignored).
    if distance(strip_accents(s_a), strip_accents(s_b)) == 1:
        return True

    # A neighbour swap keeps the length unchanged.
    if len(s_a) != len(s_b):
        return False
    return any(
        s_a[:i] + s_a[i + 1] + s_a[i] + s_a[i + 2:] == s_b
        for i in range(len(s_a) - 1)
    )
def match_maker(query, unknown):
    """
    Return the mismatch count between ``query`` and ``unknown``, giving some
    wiggle room when identifying indices and other small targets.

    ``unknown`` may be longer than ``query``; the surplus length is subtracted
    from the edit distance so that extra characters do not count as mismatches.

    :param query: the expected sequence
    :param unknown: the observed sequence
    :return: edit distance minus ``len(unknown) - len(query)``
    """
    surplus = len(unknown) - len(query)
    return distance(query, unknown) - surplus
def eval_unnatural(stem, most_freq):
    """Penalise a stem that sounds like one of the high-frequency words.

    The first ``dmeta`` code of the stem is compared with the first code of
    every word in ``most_freq``. Returns -5.0 when any of them is at edit
    distance exactly 1, and 0 otherwise.
    """
    stem_code = dmeta(stem)[0]
    frequent_codes = [dmeta(word)[0] for word in most_freq]
    gaps = [distance(stem_code, code) for code in frequent_codes]
    # Similar to a high-frequency word: regularising it would look unnatural.
    # Otherwise the stem does not resemble any high-frequency word.
    return -5.0 if 1 in gaps else 0
def last_chance(self, kw):
    """Return the index of the dictionary word closest to ``kw``, or ``'unk'``.

    ``kw`` is normalised with ``clean(basify(kw)).lower()`` and compared with
    every entry of ``self.dictionnary``. The index of the first entry with the
    smallest edit distance is returned when that distance is at most 2;
    otherwise ``'unk'`` is returned.

    Fixes over the original: an empty dictionary returned nothing and looped
    forever; the distances were recomputed for every candidate distance value.
    The effective threshold (<= 2) is unchanged.
    """
    words = self.dictionnary
    if len(words) == 0:
        return 'unk'

    target = clean(basify(kw)).lower()
    gaps = np.asarray([distance(target, s) for s in words])
    closest = gaps.min()
    if closest + 1 > 3:
        return 'unk'
    return np.flatnonzero(gaps == closest)[0]
def suggest_v1(self, word):
    """Return known spelling candidates for ``word``, closest first.

    Candidates come from ``self.edits1(word)``, or from ``self.edits2(word)``
    when the former is empty. Only candidates present in one of the three
    tries are kept; they are sorted by edit distance to ``word``.
    """
    # NOTE(review): the original computed min(10, len(...)) without using it,
    # so the full list is returned -- confirm whether a top-10 cut was meant.
    tries = (self.secondary_trie, self.trie, self.book_trie)
    candidates = list(self.edits1(word) or self.edits2(word))
    known = [
        candidate for candidate in candidates
        if any(candidate in trie for trie in tries)
    ]
    return sorted(known, key=lambda candidate: distance(candidate, word))
def single_barcode_adj(b, BClist):
    """Map an observed barcode onto the reference list.

    Returns ``[b, "match"]`` for an exact hit, ``[reference, "adj"]`` when
    exactly one reference is closest and it is within edit distance 1, and
    ``["NA", "rm"]`` otherwise.

    Fix over the original: a reference list with fewer than two entries
    raised IndexError. An empty list now yields ``["NA", "rm"]`` and a single
    reference is treated as the unique best candidate.
    """
    if b in BClist:
        return [b, "match"]

    dis_to_refer = [distance(b, referBC) for referBC in BClist]
    if not dis_to_refer:
        return ["NA", "rm"]

    ranked = sorted(dis_to_refer)
    best = ranked[0]
    # The best hit must be strictly better than the runner-up to be trusted.
    unambiguous = len(ranked) == 1 or ranked[1] - best >= 1
    if best <= 1 and unambiguous:
        return [BClist[dis_to_refer.index(best)], "adj"]
    return ["NA", "rm"]
def filterLevenshtein(msg, filterWords, englishWords, levenshteinDistance):
    """Censor ``msg`` when it is close to a filtered word and not real English.

    Non-alphanumeric characters are stripped from ``msg`` before comparison.
    If the stripped message is within ``levenshteinDistance`` edits of any
    word in ``filterWords`` and is not found in ``englishWords``, a random
    asterisk string is returned; otherwise ``msg`` is returned unchanged.

    Fix over the original: the length pre-check was written as
    ``abs(len(a) - len(b) <= k)`` (abs of a boolean), so it never skipped
    messages much longer than the word. The edit distance is at least the
    length difference, so the result is unchanged; only needless distance
    computations are avoided.
    """
    # Filter special characters from the message.
    filteredMsg = ''.join(e for e in msg if e.isalnum())

    for word in filterWords:
        if abs(len(word) - len(filteredMsg)) > levenshteinDistance:
            continue
        if distance(word, filteredMsg) > levenshteinDistance:
            continue
        # Proper English words are never censored.
        if not WordChecker.check_word_exists_in(englishWords, filteredMsg):
            return generateRandomAsteriskString()

    return msg
def _get_closest_string(string, iterable, length_dependant: bool = True, preprocess=lambda s: s.lower()): string = preprocess(string) iterable = list(filter(lambda x: x != None, iterable)) distances = sorted({ s: distance(string, preprocess(s)) / (max(len(preprocess(s)), 0.01) if length_dependant else 1) for s in iterable }.items(), key=lambda i: i[1]) if len(distances) > 0: return distances[0][0] return string
def score(
    self, query_meta: Dict, old_match_scores: Dict, match_meta: Dict
) -> "np.ndarray":
    """Re-score matches by negative edit distance to the query text.

    For every match id in ``old_match_scores`` the ``text`` of the match is
    compared with the ``text`` of the query; the previous scores themselves
    are ignored. Returns a float64 array of ``(match_id, -distance)`` rows.
    """
    from Levenshtein import distance

    rescored = [
        (match_id, -distance(query_meta['text'], match_meta[match_id]['text']))
        for match_id in old_match_scores
    ]
    return np.array(rescored, dtype=np.float64)
def weather():
    """Interactive lookup of weather-phenomenon codes.

    Repeatedly prompts for a phenomenon name and prints its code until ``-1``
    is entered. For an unknown name, every known name whose
    ``unicode_escape`` encoding is within edit distance 5 of the input is
    suggested.
    """
    # Weather phenomenon lookup
    print('\n本脚本只提供查询编码功能')
    # Codes are stored as written upstream: a mix of strings and integers.
    dic = {'露':'01','霜':'02','结冰':'03','大风':'15','积雪':'16','雾凇':'48','雨凇':56,'冰雹':89,'霾':'05','浮尘':'06','扬沙':'07','轻雾':10,'沙尘暴':31,'雾':42,'毛毛雨':50,'雨':60,'雨夹雪':68,'雪':70,'阵雨':80,'阵性雨夹雪':83,'阵雪':85}
    while True:
        key = input('\n请输入要查询的天气现象,或输入-1退出:')
        if key == '-1':
            break
        try:
            print('编码为:', dic[key])
        except KeyError as e:
            print('未找到相关关键词:', e, '\n\n你可能是想写:')
            escaped_key = key.encode('unicode_escape')
            for name in dic:
                if distance(escaped_key, name.encode('unicode_escape')) < 5:
                    print(name, end=' ')
def levenshtein(string, candidates):
    """
    Compare a string's Levenshtein distance to each candidate in a dictionary.

    Each candidate value is repeated and cut so that it has exactly the
    length of ``string`` before the comparison. The distances, keyed by
    candidate name, are handed to ``get_lowest`` and its result is returned
    (presumably the name of the closest match -- see ``get_lowest``).
    """
    distances = defaultdict(int)
    target_len = len(string)
    for name, pattern in candidates.items():
        repeats = target_len // len(pattern) + 1
        stretched = (pattern * repeats)[:target_len]
        distances[name] += distance(string, stretched)
    return get_lowest(distances)
def get_reduced_distances(chunk, edit_distance):
    """Select tags that are mutually at least ``edit_distance`` apart.

    ``chunk[0]`` is the base tag and ``chunk[1]`` a sequence of comparison
    records whose element 1 is a tag. Records whose tag is closer than
    ``edit_distance`` to the base tag are dropped. The remaining tags are
    added greedily, in order, when they are at least ``edit_distance`` away
    from every tag kept so far (the base tag is always kept). The kept tags
    are passed to ``pickler`` and its result is returned.
    """
    base, comparisons = chunk[0], chunk[1]

    # Re-create the pairwise filter against the base tag.
    distant_enough = [
        record for record in comparisons
        if distance(base, record[1]) >= edit_distance
    ]

    keepers = [base]
    for record in distant_enough:
        tag = record[1]
        # all() stops at the first tag that is too close.
        if all(distance(kept, tag) >= edit_distance for kept in keepers):
            keepers.append(tag)

    return pickler(keepers)
def iterate_insde_dict(collected_words_list, handled_ids, word, lemma, pos,examined_word_len, letter_count_dict, search_range, debug = DEBUG): collected_words = 0 #print(word, examined_word_len, "search_range", search_range) if str(examined_word_len) in letter_count_dict and examined_word_len > 2: compare_words = letter_count_dict[str(examined_word_len)] random.shuffle(compare_words) for word_compare_el in compare_words: word_compare = word_compare_el[0] if "unknown" not in word_compare_el[1]:#SPECIFIC pos_compare = word_compare_el[1] else: pos_compare = None lemm_comapre = word_compare_el[2] comp_word_id = word_compare_el[3] comp_ref_id = word_compare_el[4] comp_set_id = word_compare_el[5] if search_range > 3: pos_search_range = search_range else: pos_search_range = 1 if pos and pos_compare: if distance(lemm_comapre, lemma) > 0 and distance(lemm_comapre, lemma) <= search_range and distance(pos_compare, pos) <= pos_search_range and comp_word_id not in handled_ids: if debug:print("FOUND VS POS", word,word_compare,lemma, lemm_comapre, pos,pos_compare) collected_words_list.append({"word_id":comp_word_id, "ref_id":comp_ref_id,"setting_id":comp_set_id, "ngramm": word_compare}) handled_ids.append(comp_word_id) collected_words += 1 if (collected_words > 10 or len(collected_words_list) > 12):break else: if distance(lemm_comapre, lemma) > 0 and distance(lemm_comapre, lemma) <= search_range and comp_word_id not in handled_ids: if debug:print("FOUND NON POS", word,word_compare,lemma, lemm_comapre) collected_words_list.append({"word_id":comp_word_id, "ref_id":comp_ref_id,"setting_id":comp_set_id, "ngramm": word_compare}) handled_ids.append(comp_word_id) collected_words += 1 if (collected_words > 10 or len(collected_words_list) > 12):break #print("collected_words_list",collected_words_list) return collected_words
def buildListDict(input_file, distance_stringency, pickleOut):
    """Group the reads of a FASTQ file by their UMI.

    The file is read four lines at a time (header, sequence, separator,
    quality). The UMI is built from the first 11 and last 11 characters of
    the sequence line; the stored read is the sequence without its first and
    last 6 characters.

    Returns a defaultdict of the form
    ``{'UMI_1': (Seqs, First_Header, First_quality), ...}`` where each slot is
    a list. ``pickleOut`` is not used in this function.
    """
    sequences = defaultdict(lambda:([],[],[]))
    target = open(input_file, 'r')
    umi_list = []
    position = 1
    is_unique = True
    for line in target:
        if position == 1:
            header = line.rstrip('\n')
            position += 1
        elif position == 2:
            # NOTE(review): an upstream comment says the UMI flanks the first
            # and last 6bp of the read, but 11 characters are taken from each
            # end here -- confirm which is intended.
            umi_seq = line[0:11]+line.rstrip('\n')[-11:]
            umi_seq = umi_seq.rstrip('\n')
            read_seq = line.rstrip('\n')[6:-6]
            position += 1
        elif position == 3:
            position += 1
        elif position == 4:
            quality = line.rstrip('\n')
            position = 1
            if not bool(umi_list):
                umi_list.append(umi_seq)
            else:
                is_unique = True
                # Snap the UMI to the first known UMI within the allowed distance.
                # NOTE(review): umi_list is only ever given the first UMI of
                # the file, so later UMIs are compared against that one only
                # -- confirm whether new unique UMIs should be appended.
                for umi in umi_list:
                    if is_unique:
                        if distance(umi_seq, umi) <= distance_stringency:
                            is_unique = False
                            umi_seq = umi
            # It is important for duplex collapsing that reads of a group have
            # the same length; without duplex collapsing this is always true.
            if not is_unique and len(sequences[umi_seq][0][0]) == len(read_seq):
                sequences[umi_seq][0].append(read_seq)
            elif is_unique:
                sequences[umi_seq][0].append(read_seq)
            # Only the first header/quality of a group is stored.
            if is_unique and not bool(sequences[umi_seq][1]):
                sequences[umi_seq][1].append(header)
                sequences[umi_seq][2].append(quality)
    target.close()
    return sequences
def attach_UMI_barcode(Read1, Read2, barcodes, mismatch_rate=1):
    """Demultiplex paired FASTQ files by the barcode found in Read1.

    For each record, the first 6 characters of the Read1 sequence line are
    compared with every entry of ``barcodes``; characters 6-12 are taken as
    the UMI. When the closest barcode is within ``mismatch_rate`` edits, the
    matching Read2 record is appended to ``<barcode>.align.fastq`` in the
    current working directory, with its header and separator lines rewritten
    as ``@<barcode>,<UMI>,<rest of header>`` and
    ``+<barcode>,<UMI>,<rest of separator>``. Other records are skipped.

    Args:
        Read1: path of the FASTQ file carrying barcode and UMI.
        Read2: path of the FASTQ file whose records are written out.
        barcodes: sequence of barcode strings; on ties the first one wins.
        mismatch_rate: maximum edit distance for a barcode match (coerced
            with ``int``).

    Returns:
        None.
    """
    mismatch_rate = int(mismatch_rate)
    # One append-mode handle per barcode, opened lazily and closed in the
    # ``finally`` below. The original reopened the output for every record
    # and crashed on ``f3.close()`` when nothing had matched.
    outputs = {}
    try:
        with open(Read1) as f1, open(Read2) as f2:
            line1 = f1.readline()   # header of the first Read1 record
            line2 = f2.readline()   # header of the first Read2 record
            while line1:
                line1 = f1.readline()   # Read1 sequence line
                target = line1[0:6]
                mismatch = [distance(target, candidate) for candidate in barcodes]
                # An empty barcode list simply matches nothing.
                if mismatch and min(mismatch) <= mismatch_rate:
                    barcode = barcodes[mismatch.index(min(mismatch))]
                    out = outputs.get(barcode)
                    if out is None:
                        out = open(barcode + ".align.fastq", "a")
                        outputs[barcode] = out
                    UMI = line1[6:12]
                    out.write("@" + barcode + "," + UMI + "," + line2[1:])
                    out.write(f2.readline())
                    third_line = f2.readline()
                    out.write("+" + barcode + "," + UMI + "," + third_line[1:])
                    out.write(f2.readline())
                    line2 = f2.readline()   # header of the next Read2 record
                else:
                    # Skip the rest of this Read2 record; line2 ends up on
                    # the next header.
                    for _ in range(4):
                        line2 = f2.readline()
                # Skip separator and quality; line1 ends up on the next
                # Read1 header (or '' at end of file).
                for _ in range(3):
                    line1 = f1.readline()
    finally:
        for handle in outputs.values():
            handle.close()
def calc_distance_matrix(data):
    """Calculate a distance matrix between languages.

    ``data`` maps ``language -> {concept -> list of words}``. For every
    ordered pair of languages:

    1. For each concept present in both languages, take the smallest
       Levenshtein distance over all pairs of translations.
    2. Normalize these per-concept distances by the largest of them for that
       language pair (skipped when the largest is 0).
    3. The language distance is the mean of the normalized values.

    A pair of languages without any comparable concept (no shared concept,
    or only empty translation lists) gets ``NaN`` instead of raising.

    Returns:
        the distance matrix (pandas DataFrame) indexed and labelled by the
        keys of ``data``.
    """
    languages = list(data.keys())
    d_matrix = np.zeros(shape=(len(languages), len(languages)))

    for i, lang1_dict in enumerate(data.values()):
        for j, lang2_dict in enumerate(data.values()):
            lv_distances = []
            for concept, word1_l in lang1_dict.items():
                word2_l = lang2_dict.get(concept, None)
                # Concept missing in the second language.
                if word2_l is None:
                    continue
                # Some concepts have several translations; the smallest
                # pairwise distance represents the concept.
                dis_list = [distance(w1, w2) for w1 in word1_l for w2 in word2_l]
                if not dis_list:
                    # One side has no translations: nothing to compare.
                    continue
                lv_distances.append(np.min(dis_list))

            if not lv_distances:
                # np.max / the mean are undefined on an empty list.
                d_matrix[i][j] = np.nan
                continue

            longest_wd = np.max(lv_distances)
            if longest_wd > 0:
                lv_distances = [x / longest_wd for x in lv_distances]
            d_matrix[i][j] = np.sum(lv_distances) / len(lv_distances)

    return DataFrame(d_matrix, index=languages, columns=languages)
def distance(self, other):
    """Return length-normalized edit distances between two items' parts.

    Both objects are asked for ``get_item_parts()``; the first part (the Id)
    is dropped. Parts are then compared position by position, following the
    positions of ``self``. A position contributes only when both parts are
    non-empty; its value is the edit distance divided by the length of the
    longer part. The inner ``distance`` call refers to the module-level
    function of that name (presumably the string edit distance -- confirm
    against the file's imports).

    Returns a list of floats, one per compared position.
    """
    own_parts = self.get_item_parts()
    other_parts = other.get_item_parts()
    # drop Id:
    own_parts, other_parts = own_parts[1:], other_parts[1:]

    ratios = []
    for idx, left in enumerate(own_parts):
        right = other_parts[idx]
        both_present = (
            left != "" and right != ""
            and left is not None and right is not None
        )
        if both_present:
            longest = max(len(left), len(right))
            ratios.append(float(distance(left, right)) / longest)
    return ratios
def insert(self, word):
    """Insert ``word`` into the tree rooted at this node.

    Returns the word of the node that became the parent of the new node,
    ``'none'`` when this node was empty and now holds ``word`` itself, or
    ``'not_inserted'`` when ``word`` is already stored at the node reached.

    Children are stored as ``(subtree, edge_distance)`` pairs; a word is
    passed down to the child whose edge equals its distance to this node's
    word, or attached as a new child when no such edge exists.
    """
    if self.word is None:
        self.word = word
        return 'none'
    if self.word == word:
        return 'not_inserted'

    gap = distance(word, self.word)
    for subtree, edge in self.children:
        if edge == gap:
            return subtree.insert(word)

    self.children.append((SpellTree(word), gap))
    return self.word
def find_music(app, root):
    """Open the file under ``root`` whose name is closest to ``app``.

    Walks ``root`` recursively, scores every file name by its edit distance
    to ``app`` and keeps the first file with the lowest score (scores of
    999999 or more are never selected). Each time a better candidate is
    found it is printed. The winner is handed to the ``open`` command (as
    provided by macOS); nothing is launched when no file was found.

    Args:
        app: name to look for.
        root: directory to search.

    Returns:
        None.
    """
    import shlex

    tolaunch = ""
    minimum = 999999
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            score = int(distance(filename, app))
            if score < minimum:
                tolaunch = os.path.join(dirpath, filename)
                minimum = score
                print("actual best : "+str(tolaunch)+ " distance : "+str(minimum))

    if not tolaunch:
        # Nothing matched: avoid running `open ""`.
        return

    # File names come from disk and may contain quotes or shell
    # metacharacters; quote the path so it cannot break out of the command.
    os.system("open " + shlex.quote(tolaunch))
def get_syn_sim(self, node1, node2):
    """
    Return the syntactic similarity of the two given nodes.

    The name of each node is read from ``node[1]['node_name']``. The
    similarity is ``1 - edit_distance / length_of_longer_name``; two empty
    names are considered identical.

    :param node1: first node
    :param node2: second node
    :return: 1 when both names are empty, otherwise the similarity above
    """
    name_a = node1[1]['node_name']
    name_b = node2[1]['node_name']
    edit_gap = distance(name_a, name_b)

    longest = max(len(name_a), len(name_b))
    if longest == 0:
        # Both names are empty; avoid dividing by zero.
        return 1
    return 1 - edit_gap / longest
def validate_password_dictionary(value):
    """
    Ensure that the password is not too similar to any word of a configured
    dictionary.

    Reads ``PASSWORD_DICTIONARY_EDIT_DISTANCE_THRESHOLD`` and
    ``PASSWORD_DICTIONARY`` from ``settings``. The check is skipped when
    either setting is missing or falsy (note that this includes a threshold
    of 0). Otherwise a ``ValidationError`` with code ``"dictionary_word"`` is
    raised as soon as a dictionary word lies within the threshold edit
    distance of ``value``.
    """
    threshold = getattr(
        settings, "PASSWORD_DICTIONARY_EDIT_DISTANCE_THRESHOLD", None)
    restricted_words = getattr(settings, "PASSWORD_DICTIONARY", None)

    if not (threshold and restricted_words):
        return

    too_similar = any(
        distance(text_type(value), text_type(word)) <= threshold
        for word in restricted_words
    )
    if too_similar:
        raise ValidationError(
            _("Too similar to a restricted dictionary word."),
            code="dictionary_word")
def compute_speaker_Levenshtein_distance(speaker_name):
    """Return the two known speakers closest to ``speaker_name``.

    Names are loaded with ``read_names("APnames.xlsx")``. Each row is scored
    twice: once by the edit distance between ``speaker_name`` and the row's
    ``'Full Name'`` value, and once by the distance to the row's index label
    (presumably an alternative form of the name -- confirm against the
    spreadsheet). The smaller of the two is kept per full name.

    Returns a list of at most two ``(full_name, distance)`` tuples, ordered
    by ascending distance.
    """
    full_speaker_names = read_names("APnames.xlsx")

    # First pass: distance to the full name itself.
    best_by_name = {}
    for full_name in full_speaker_names['Full Name']:
        best_by_name[full_name] = distance(full_name, speaker_name)

    # Second pass: let the index label improve on the score of its row.
    for row, index_label in enumerate(full_speaker_names.index.values):
        candidate = distance(index_label, speaker_name)
        full_name = full_speaker_names["Full Name"].iloc[row]
        if full_name not in best_by_name or candidate < best_by_name[full_name]:
            best_by_name[full_name] = candidate

    ranked = sorted(best_by_name.items(), key=lambda kv: kv[1])
    return ranked[:2]
def closest_hexameter_patterns(self, scansion: str) -> list:
    """Find the closest group of matching valid hexameter patterns.

    Spaces and foot separators are removed from ``scansion``; the final
    symbol is replaced by the optional-ending marker before comparing with
    ``self.VALID_HEXAMETERS`` and put back onto every returned pattern.

    :param scansion: the scansion line to match
    :return: list of the closest valid hexameter patterns, in the order they
        appear in ``self.VALID_HEXAMETERS``; only candidates with a matching
        length/number of syllables are considered. An empty list is returned
        when there is no such candidate or when ``scansion`` contains no
        syllable symbols at all.
    """
    pattern = scansion.replace(" ", "")
    pattern = pattern.replace(self.constants.FOOT_SEPARATOR, "")
    if not pattern:
        # Nothing to scan; indexing pattern[-1] below would fail.
        return []

    ending = pattern[-1]
    candidate = pattern[:-1] + self.constants.OPTIONAL_ENDING
    scored = [
        (distance(candidate, valid), valid)
        for valid in self.VALID_HEXAMETERS
        if len(valid) == len(candidate)
    ]
    if not scored:
        return []

    # Only the minimum is needed, so a full sort is unnecessary; filtering
    # keeps ties in their original order, as the stable sort did.
    best = min(score for score, _ in scored)
    return [valid[:-1] + ending for score, valid in scored if score == best]
def search(self, word):
    """Look up ``word`` and return candidate entries with a mismatch score.

    The lower-cased word is checked with ``self.inList`` first; on a hit the
    result is ``{self.get(word): 0}``. Otherwise every value of
    ``self.edits(word)`` that is in the list and starts with the same
    (lower-cased) first character as the input becomes a candidate, scored
    as ``edit_distance / len(input) * 100``.

    Returns a dict mapping ``self.get(...)`` results to their score; empty
    when nothing matches.
    """
    reference = word
    lowered = word.lower()

    if self.inList(lowered):
        return {self.get(lowered): 0}

    candidates = {}
    for variant in self.edits(lowered).values():
        if not (self.inList(variant) and variant[0] == reference[0].lower()):
            continue
        canonical = self.get(variant).lower()
        # From the first match on, the reference stays lower-cased.
        reference = reference.lower()
        gap = distance(canonical, reference)
        candidates[self.get(canonical)] = gap / len(reference) * 100
    return candidates
def get_LD(i, j):
    '''
    Calculate the sequence distance between a pair of Seq objects.

    Sequences of different junction length are compared with the edit
    distance of their ``junc`` strings. For equal lengths a global alignment
    is scored instead (match 1, mismatch 0, gap open/extend -50), which
    forces a 'gapless' comparison; the distance is then ``junc_len`` minus
    that score. A score that is not exactly of type ``float`` is treated as
    zero identity.
    '''
    if i.junc_len != j.junc_len:
        # Levenshtein distance is used for sequence pairs of different lengths
        return distance(i.junc, j.junc)

    # pairwise2 is used to force 'gapless' distance when the pair has the
    # same length
    identity = pairwise2.align.globalms(
        i.junc, j.junc, 1, 0, -50, -50,
        score_only=True, one_alignment_only=True)
    if type(identity) is not float:
        identity = 0.0
    return i.junc_len - identity