def load_distances(wordlist):
    """Compute pairwise Damerau-Levenshtein distances for `wordlist`.

    Distances are stored under both "w1:w2" and "w2:w1" keys and persisted
    to a JSON cache in /tmp so repeated runs only compute missing pairs.

    :param wordlist: iterable of words to compare pairwise.
    :returns: dict mapping "w1:w2" -> int distance (0 on the diagonal).
    """
    cache_path = "/tmp/word_distances_cache.json"
    try:
        with io.open(cache_path, mode="r", encoding="utf-8") as fobj:
            distances = json.loads(fobj.read())
    except (OSError, ValueError):
        # Cache missing or unparseable: start from scratch.
        # (json raises JSONDecodeError, a ValueError subclass, on bad data.)
        distances = {}
    # NOTE(review): the return value was previously bound to an unused local.
    # The call is kept in case read_word_frequencies() has side effects
    # (e.g. priming a cache) — TODO confirm and drop if it has none.
    read_word_frequencies()
    for w1 in wordlist:
        for w2 in wordlist:
            keya = w1 + ":" + w2
            keyb = w2 + ":" + w1
            if w1 == w2:
                distances[keya] = 0
                distances[keyb] = 0
            elif keya not in distances:
                # Distance is symmetric, so fill both key orders at once.
                d = pylev.damerau_levenshtein(w1, w2)
                distances[keya] = d
                distances[keyb] = d
    with io.open(cache_path, mode="w", encoding="utf-8") as fobj:
        fobj.write(json.dumps(distances))
    return distances
def similarity(a, b):
    """Return a Munch of similarity metrics between strings *a* and *b*.

    The Munch captures *all* locals — a, b, ratio, dist, diff_len,
    len_penalty, score — even though `score` currently just mirrors `ratio`.
    """
    ratio = fuzz.ratio(a, b)
    dist = damerau_levenshtein(a, b)
    diff_len = abs(len(a) - len(b))
    # NOTE(review): log(0) raises ValueError when a is empty and b is not —
    # confirm callers never pass an empty first argument.
    len_penalty = log(len(a) / (1 + diff_len))
    # penalty = 0.5 + 1/log(1 + dist)
    score = ratio
    # locals() is captured deliberately: dist/diff_len/len_penalty are part
    # of the returned record even though score ignores them, so none of
    # these assignments is dead code.
    return Munch(locals())
def get_command(self, command):
    """Return the entry of self.command_list closest to `command`.

    Closeness is Damerau-Levenshtein distance; the first candidate with
    the lowest distance wins. Returns "" when the list is empty.
    """
    best_match, best_distance = "", 99999999
    for candidate in self.command_list:
        candidate_distance = damerau_levenshtein(command, candidate)
        if candidate_distance < best_distance:
            best_match, best_distance = candidate, candidate_distance
    return best_match
def get_data(self, country_name):
    """Look up cached data for `country_name`, refreshing when the TTL lapses.

    Tries an exact (lower-cased) key first, then falls back to the cache key
    with the smallest Damerau-Levenshtein distance. Returns None when no
    cache is available.
    """
    # Refresh the cache when its TTL has expired; keep stale data on failure.
    if self.time < time():
        ok, payload = _load_data()
        if ok:
            self.cache = payload
            self.time = time() + self.ttl
    if self.cache is None:
        return None
    needle = country_name.lower()
    if needle in self.cache:
        return self.cache[needle]
    # Fuzzy fallback: first key with the minimal edit distance wins.
    best_name, best_score = "", 999999999
    for name in self.cache:
        candidate_score = damerau_levenshtein(name, needle)
        if candidate_score < best_score:
            best_name, best_score = name, candidate_score
    return self.cache[best_name] if best_name in self.cache else None
def resolve_entity(raw_string, bias=None):
    """Search for `raw_string` (optionally biased toward location `bias`)
    and return candidate Munches sorted by (geodist, editdist, ratio).

    :param raw_string: entity name to resolve.
    :param bias: optional location name used as a geodesic anchor.
    :returns: list of Munch candidates, best first.
    :raises AssertionError: when `bias` is given but cannot be resolved.
    """
    # BUG FIX: search() expects a list of queries; a bare string was
    # previously passed when no bias was supplied.
    args = [raw_string] if not bias else [raw_string, bias]
    result = search(args)
    candidates = []
    c1 = None  # coords of the bias location, once resolved
    if bias:
        if bias in result and result[bias]:
            for rank, name, coords in result[bias]:
                c1 = coords
                break
        # BUG FIX: message was a plain string, not an f-string; and c1 could
        # be unbound (NameError) when bias was absent from the results.
        assert c1, f"Location not found: {bias}"
    if raw_string in result and result[raw_string]:
        for v in result[raw_string]:
            rank, name, coords = v
            lat = coords[0] if coords else None
            lon = coords[1] if coords else None
            # BUG FIX: take .kilometers so geodist is a comparable number
            # rather than a geodesic object (sorting would otherwise fail).
            dist = math.inf if (not coords or not bias) else geodesic(
                c1, coords).kilometers
            edit_dist = damerau_levenshtein(raw_string, name)
            ratio = fuzz.ratio(raw_string, name)
            candidates.append(
                Munch({
                    "query": raw_string,
                    "bias": bias,
                    "result": name,
                    "rank": rank,
                    "geodist": dist,
                    "editdist": edit_dist,
                    "ratio": ratio,
                    "coords": coords,
                    "lat": lat,
                    "lon": lon,
                    "has_coords": bool(coords),
                }))
    _sorted = sorted(candidates,
                     key=lambda result: (result.geodist, result.editdist,
                                         result.ratio))
    return _sorted
def dist_fn(wl_word: str) -> int:
    """Damerau-Levenshtein distance from the enclosing `word` to `wl_word`.

    `word` is a free variable captured from the surrounding scope.
    """
    result = pylev.damerau_levenshtein(word, wl_word)
    assert isinstance(result, int)
    return result
import pylev

# Interactive demo: compare pylev's three Levenshtein variants on two strings.
while True:
    try:
        string1 = input("Enter String 1: ")
        string2 = input("Enter String 2: ")
    except (EOFError, KeyboardInterrupt):
        # BUG FIX: Ctrl-D / Ctrl-C previously crashed with a traceback;
        # exit the prompt loop cleanly instead.
        break
    distance_dl = pylev.damerau_levenshtein(string1, string2)
    print("Damerau Levenshtein Distance ", distance_dl)
    distance_cl = pylev.classic_levenshtein(string1, string2)
    print("Classic Levenshtein Distance ", distance_cl)
    # NOTE: recursive_levenshtein is exponential-time; long inputs will hang.
    distance_re = pylev.recursive_levenshtein(string1, string2)
    print("Recursive Levenshtein Distance: ", distance_re)
def test_damerau_levenshtein(self):
    """pylev.damerau_levenshtein handles transposition, swap, and mixed edits."""
    # BUG FIX: parameter was misspelled `seld`.
    # "ba" -> "abc": one transposition (ba -> ab) + one insertion.
    assert pylev.damerau_levenshtein("ba", "abc") == 2
    # "foobar" -> "foobra": a single adjacent transposition counts as one edit.
    assert pylev.damerau_levenshtein("foobar", "foobra") == 1
    # "fee" -> "deed": one substitution (f -> d) + one insertion.
    assert pylev.damerau_levenshtein("fee", "deed") == 2
def resolve_entity(raw_string, bias=None):
    """Search for `raw_string` (optionally biased toward location `bias`)
    and return scored candidate Munches sorted by (geodist, editdist, ratio).

    :param raw_string: entity name to resolve.
    :param bias: optional location name used as a geodesic anchor.
    :returns: list of Munch candidates, best first.
    :raises AssertionError: when `bias` is given but cannot be resolved.
    """
    args = [raw_string] if not bias else [raw_string, bias]
    result = search(args)
    candidates = []
    # BUG FIX: c1 was unbound when `bias` was given but absent from the
    # results, producing a NameError instead of the intended AssertionError.
    c1 = None
    if bias:
        if bias in result and result[bias]:
            for v in result[bias]:
                c1 = v.coords
                break
        assert c1, f"Location not found: {bias}"
    if raw_string in result and result[raw_string]:
        for v in result[raw_string]:
            dist = (math.inf if (not v.coords or not bias) else geodesic(
                c1, v.coords).kilometers)
            edit_dist = damerau_levenshtein(raw_string.lower(),
                                            v.result.lower())
            ratio = fuzz.token_sort_ratio(raw_string, v.result)
            # Collect categories that mention any 3+-char token of the query.
            corefs = set()
            _ks = re.findall(r"([\w\s\-]{3,})", raw_string)
            for _k in _ks:
                for cat in v.categories:
                    if re.search(_k, cat, re.IGNORECASE):
                        corefs.add(cat)
            corefs = list(corefs)
            if ratio > 68 or edit_dist <= 10:
                try:
                    # Treat "no distance" as a very large but finite value so
                    # the log-based score stays computable.
                    if dist is math.inf:
                        d = pow(2, 64)
                    else:
                        d = dist
                    # log base 1 (no categories) raises ZeroDivisionError,
                    # handled below.
                    score = log(1 + d, 1 + len(v.categories)) * -1
                except Exception as e:
                    print(e.__class__.__name__, e, dist, len(v.categories))
                    score = math.inf
            else:
                score = ratio
            # More coreferences -> more negative (better) score.
            score *= len(corefs) / 2 * -1
            if edit_dist <= 4:
                # Near-exact name match always wins.
                score = -math.inf
            index = (dist, edit_dist, ratio)
            candidates.append(
                Munch({
                    "query": raw_string,
                    "bias": bias,
                    "result": v.result,
                    "rank": v.rank,
                    "geodist": dist,
                    "editdist": edit_dist,
                    "corefs": list(corefs),
                    "ratio": ratio,
                    "coords": v.coords,
                    "index": index,
                    "score": score,
                    "lat": v.lat,
                    "lon": v.lon,
                    "categories": v.categories,
                    "has_coords": bool(v.coords),
                }))
    _sorted = sorted(candidates, key=lambda result: result.index)
    return _sorted
def test_wordlist_distances():
    """Every pair of distinct WORDLIST entries must be >= 3 edits apart."""
    # PERF: distance is symmetric, so checking each unordered pair once via
    # combinations() halves the old product() scan. The equality guard keeps
    # behavior identical even if WORDLIST contains duplicate entries.
    for w1, w2 in itertools.combinations(WORDLIST, 2):
        if w1 != w2:
            d = pylev.damerau_levenshtein(w1, w2)
            assert d >= 3, (w1, w2)
lineNumber = match["lineNumber"] print("{0}:{1}:{2}".format(url, lineNumber, functionName)) # # Retrieve snippet # if not printSnippets: continue candidateParts = url.split('/') candidateName = candidateParts[len(candidateParts) - 1] if candidateName not in paths: continue candidatePaths = paths[candidateName] bestDistance = 9999 bestPath = '' for candidatePath in candidatePaths: distance = pylev.damerau_levenshtein(candidatePath, url) if (distance < bestDistance): bestDistance = distance bestPath = candidatePath if not bestPath: continue with open(bestPath, "rb") as f2: lines = f2.readlines() snippet = lines[lineNumber:lineNumber + 5] print(highlight_snippet(''.join(snippet)))