コード例 #1
0
ファイル: wordlist_gen.py プロジェクト: mbarkhau/sbk
def load_distances(wordlist):
    """Return pairwise Damerau-Levenshtein distances for ``wordlist``.

    The result is a dict keyed by ``"<w1>:<w2>"``; every pair is stored under
    both key orders, and a word paired with itself always maps to 0.
    Distances already present in the JSON cache file are reused, missing ones
    are computed with ``pylev.damerau_levenshtein``, and the updated mapping
    is written back to the cache file before it is returned.
    """
    cache_path = "/tmp/word_distances_cache.json"
    try:
        with io.open(cache_path, mode="r", encoding="utf-8") as fobj:
            distances = json.loads(fobj.read())
    except (IOError, OSError, ValueError):
        # Missing/unreadable cache file or invalid JSON/encoding: start empty.
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        distances = {}

    if not isinstance(distances, dict):
        # A syntactically valid cache holding something other than a JSON
        # object is unusable for the key lookups below.
        distances = {}

    # NOTE(review): the return value is not used in this function; the call is
    # kept in case it has side effects -- confirm and drop it if it has none.
    read_word_frequencies()

    for w1 in wordlist:
        for w2 in wordlist:
            keya = w1 + ":" + w2
            keyb = w2 + ":" + w1
            if w1 == w2:
                distances[keya] = 0
                distances[keyb] = 0
            elif keya not in distances:
                # The same value is stored under both key orders.
                d = pylev.damerau_levenshtein(w1, w2)
                distances[keya] = d
                distances[keyb] = d

    with io.open(cache_path, mode="w", encoding="utf-8") as fobj:
        fobj.write(json.dumps(distances))

    return distances
コード例 #2
0
def similarity(a, b):
    """Compute several similarity measures between ``a`` and ``b``.

    Returns a ``Munch`` built from ``locals()``, so the result exposes every
    local name of this function: ``a``, ``b``, ``ratio``, ``dist``,
    ``diff_len``, ``len_penalty`` and ``score``. Renaming or adding a local
    variable therefore changes the returned fields.
    """
    ratio = fuzz.ratio(a, b)
    dist = damerau_levenshtein(a, b)
    diff_len = abs(len(a) - len(b))
    # NOTE(review): for an empty ``a`` the argument of ``log`` is 0; if ``log``
    # is ``math.log`` this raises ValueError -- confirm against the import.
    len_penalty = log(len(a) / (1 + diff_len))
    # Disabled alternative penalty formula, kept by the original author:
    # penalty = 0.5 + 1/log(1 + dist)

    # The score is currently just the fuzz ratio; ``dist`` and ``len_penalty``
    # are only reported through the returned locals, not folded into it.
    score = ratio
    return Munch(locals())
コード例 #3
0
ファイル: parse_command.py プロジェクト: gitmonox/corona
 def get_command(self, command):
     """Return the entry of ``self.command_list`` closest to ``command``.

     Closeness is the Damerau-Levenshtein distance; on ties the earliest
     entry wins. An empty string is returned when no entry beats the
     initial sentinel distance (e.g. the command list is empty).
     """
     closest, lowest = "", 99999999
     for candidate in self.command_list:
         distance = damerau_levenshtein(command, candidate)
         if distance < lowest:
             closest, lowest = candidate, distance
     return closest
コード例 #4
0
 def get_data(self, country_name):
     """Return the cached entry for ``country_name``, or None.

     When ``self.time`` has passed, ``_load_data()`` is called; its payload
     replaces ``self.cache`` only if it reports success, and the expiry is
     pushed forward by ``self.ttl`` either way. The lookup uses the
     lower-cased name: an exact key match is returned directly, otherwise
     the key with the smallest Damerau-Levenshtein distance is used
     (earliest key wins on ties).
     """
     expired = self.time < time()
     if expired:
         ok, fresh = _load_data()
         if ok:
             self.cache = fresh
         self.time = time() + self.ttl

     if self.cache is None:
         return None

     wanted = country_name.lower()
     if wanted in self.cache:
         return self.cache[wanted]

     # No exact hit: fall back to the closest key by edit distance.
     closest, lowest = "", 999999999
     for candidate in self.cache.keys():
         distance = damerau_levenshtein(candidate, wanted)
         if distance < lowest:
             lowest, closest = distance, candidate

     return self.cache[closest] if closest in self.cache else None
コード例 #5
0
def resolve_entity(raw_string, bias=None):
    """Search for ``raw_string`` and return its candidate matches, sorted.

    Args:
        raw_string: The text to resolve; passed to ``search``.
        bias: Optional second query. When given, the coordinates of its first
            search hit are used as the reference point for geodesic distance.

    Returns:
        A list of ``Munch`` records (one per hit for ``raw_string``) sorted
        ascending by ``(geodist, editdist, ratio)``. ``geodist`` is
        ``math.inf`` when the hit has no coordinates or no reference point is
        available.

    Raises:
        AssertionError: If ``bias`` has search hits but the first one carries
            no coordinates.
    """
    # NOTE(review): without a bias the bare string (not a one-element list) is
    # passed to ``search``; kept as in the original -- confirm against search().
    args = raw_string if not bias else [raw_string, bias]
    result = search(args)

    # Reference coordinates taken from the first hit for ``bias``. Initialised
    # up front so a bias that is absent from ``result`` no longer leads to an
    # unbound ``c1`` further down.
    c1 = None
    if bias and bias in result and result[bias]:
        for _rank, _name, bias_coords in result[bias]:
            c1 = bias_coords
            break
        assert c1, "Location not found: {bias}".format(bias=bias)

    candidates = []
    if raw_string in result and result[raw_string]:
        for rank, name, coords in result[raw_string]:
            lat = coords[0] if coords else None
            lon = coords[1] if coords else None
            if coords and c1:
                dist = geodesic(c1, coords)
            else:
                # No coordinates on this hit, or no usable reference point.
                dist = math.inf
            edit_dist = damerau_levenshtein(raw_string, name)
            ratio = fuzz.ratio(raw_string, name)

            candidates.append(
                Munch({
                    "query": raw_string,
                    "bias": bias,
                    "result": name,
                    "rank": rank,
                    "geodist": dist,
                    "editdist": edit_dist,
                    "ratio": ratio,
                    "coords": coords,
                    "lat": lat,
                    "lon": lon,
                    "has_coords": bool(coords),
                }))

    return sorted(
        candidates,
        key=lambda candidate:
        (candidate.geodist, candidate.editdist, candidate.ratio))
コード例 #6
0
 def dist_fn(wl_word: str) -> int:
     """Return the Damerau-Levenshtein distance from ``word`` to ``wl_word``.

     ``word`` is taken from the enclosing scope.
     """
     distance = pylev.damerau_levenshtein(word, wl_word)
     # Narrow the library's return value to ``int`` for the type checker.
     assert isinstance(distance, int)
     return distance
コード例 #7
0
import pylev

# Interactive demo: repeatedly read two strings and print their distance
# according to three pylev implementations.
while True:
    try:
        string1 = input("Enter String 1: ")
        string2 = input("Enter String 2: ")
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C at a prompt: leave the loop cleanly instead
        # of terminating with a traceback.
        print()
        break

    distance_dl = pylev.damerau_levenshtein(string1, string2)

    print("Damerau Levenshtein Distance ", distance_dl)

    distance_cl = pylev.classic_levenshtein(string1, string2)

    print("Classic Levenshtein Distance ", distance_cl)

    distance_re = pylev.recursive_levenshtein(string1, string2)

    print("Recursive Levenshtein Distance: ", distance_re)
コード例 #8
0
ファイル: tests.py プロジェクト: ixtel/pylev
 def test_damerau_levenshtein(self):
     """Check ``pylev.damerau_levenshtein`` on three known string pairs."""
     # "ba" -> "abc": one transposition plus one insertion.
     assert pylev.damerau_levenshtein("ba", "abc") == 2
     # "foobar" -> "foobra": a single adjacent transposition.
     assert pylev.damerau_levenshtein("foobar", "foobra") == 1
     # "fee" -> "deed": one substitution plus one insertion.
     assert pylev.damerau_levenshtein("fee", "deed") == 2
コード例 #9
0
def resolve_entity(raw_string, bias=None):
    """Search for ``raw_string`` and return its scored candidates, sorted.

    Args:
        raw_string: The text to resolve; passed to ``search``.
        bias: Optional second query. When given, the coordinates of its first
            search hit are used as the reference point for geodesic distance.

    Returns:
        A list of ``Munch`` records (one per hit for ``raw_string``) sorted
        ascending by their ``index`` tuple ``(geodist, editdist, ratio)``.
        ``geodist`` is in kilometers, or ``math.inf`` when the hit has no
        coordinates or no reference point is available.

    Raises:
        AssertionError: If ``bias`` has search hits but the first one carries
            no coordinates.
    """
    args = [raw_string] if not bias else [raw_string, bias]
    result = search(args)

    # Reference coordinates taken from the first hit for ``bias``. Initialised
    # up front so a bias that is absent from ``result`` no longer leads to an
    # unbound ``c1`` further down.
    c1 = None
    if bias and bias in result and result[bias]:
        for v in result[bias]:
            c1 = v.coords
            break
        assert c1, f"Location not found: {bias}"

    candidates = []
    if raw_string in result and result[raw_string]:
        # Runs of 3+ word/space/hyphen characters from the query; they depend
        # only on the query, so they are extracted once for all hits.
        _ks = re.findall(r"([\w\s\-]{3,})", raw_string)

        for v in result[raw_string]:
            if v.coords and c1:
                dist = geodesic(c1, v.coords).kilometers
            else:
                # No coordinates on this hit, or no usable reference point.
                dist = math.inf
            edit_dist = damerau_levenshtein(raw_string.lower(),
                                            v.result.lower())
            ratio = fuzz.token_sort_ratio(raw_string, v.result)

            # Categories of this hit that contain any query fragment
            # (case-insensitive), without duplicates.
            corefs = set()
            for _k in _ks:
                for cat in v.categories:
                    if re.search(_k, cat, re.IGNORECASE):
                        corefs.add(cat)
            corefs = list(corefs)

            if ratio > 68 or edit_dist <= 10:
                try:
                    # Compare by value rather than identity; an unknown
                    # distance is replaced by a large finite stand-in.
                    d = pow(2, 64) if math.isinf(dist) else dist
                    score = log(1 + d, 1 + len(v.categories)) * -1
                except Exception as e:
                    # Best-effort: e.g. a hit without categories gives log
                    # base 1. Report the failure and rank the hit last.
                    print(e.__class__.__name__, e, dist, len(v.categories))
                    score = math.inf
            else:
                score = ratio
                score *= len(corefs) / 2 * -1
            if edit_dist <= 4:
                # Near-exact textual matches always get the lowest score.
                score = -math.inf

            index = (dist, edit_dist, ratio)
            candidates.append(
                Munch({
                    "query": raw_string,
                    "bias": bias,
                    "result": v.result,
                    "rank": v.rank,
                    "geodist": dist,
                    "editdist": edit_dist,
                    "corefs": corefs,
                    "ratio": ratio,
                    "coords": v.coords,
                    "index": index,
                    "score": score,
                    "lat": v.lat,
                    "lon": v.lon,
                    "categories": v.categories,
                    "has_coords": bool(v.coords),
                }))

    return sorted(candidates, key=lambda candidate: candidate.index)
コード例 #10
0
ファイル: test_mnemonic.py プロジェクト: mbarkhau/sbk
def test_wordlist_distances():
    """Every two distinct words of WORDLIST must be at least 3 edits apart."""
    # Ordered pairs taken from different positions, in the same order a full
    # cross product would visit them.
    for first, second in itertools.permutations(WORDLIST, 2):
        if first == second:
            # The same word listed at two positions is not compared.
            continue
        distance = pylev.damerau_levenshtein(first, second)
        assert distance >= 3, (first, second)
コード例 #11
0
 def test_damerau_levenshtein(self):
     """Check ``pylev.damerau_levenshtein`` on three known string pairs."""
     # "ba" -> "abc": one transposition plus one insertion.
     assert pylev.damerau_levenshtein("ba", "abc") == 2
     # "foobar" -> "foobra": a single adjacent transposition.
     assert pylev.damerau_levenshtein("foobar", "foobra") == 1
     # "fee" -> "deed": one substitution plus one insertion.
     assert pylev.damerau_levenshtein("fee", "deed") == 2
コード例 #12
0
ファイル: chrome-profile.py プロジェクト: Silentsoul04/env
        lineNumber = match["lineNumber"]
        print("{0}:{1}:{2}".format(url, lineNumber, functionName))

        #
        # Retrieve snippet
        #

        if not printSnippets:
            continue

        candidateParts = url.split('/')
        candidateName = candidateParts[len(candidateParts) - 1]
        if candidateName not in paths:
            continue

        candidatePaths = paths[candidateName]
        bestDistance = 9999
        bestPath = ''
        for candidatePath in candidatePaths:
            distance = pylev.damerau_levenshtein(candidatePath, url)
            if (distance < bestDistance):
                bestDistance = distance
                bestPath = candidatePath
        if not bestPath:
            continue

        with open(bestPath, "rb") as f2:
            lines = f2.readlines()
            snippet = lines[lineNumber:lineNumber + 5]
            print(highlight_snippet(''.join(snippet)))