Esempio n. 1
0
    def _get_distance(self, a, b, metric='ed', max_len=None):
        """
        Compute the distance between two token lists a and b.

        Args:
            a: first sequence to compare.
            b: second sequence to compare.
            metric: name of the distance to compute:
                'ed'    -- ``Levenshtein().distance(a, b)``
                'dtw'   -- first value returned by ``dtw_func`` when called
                           with a 0/1 (equal / not equal) element cost
                'lcs'   -- ``LongestCommonSubsequence().distance(a, b)``
                'mylcs' -- the 'lcs' distance divided by ``max_len``
                'mlcs'  -- ``MetricLCS().distance(a, b)``
            max_len: divisor used by the 'mylcs' metric only; every other
                metric ignores it.

        Returns:
            The value produced by the selected metric.

        Raises:
            ValueError: if metric is 'mylcs' and max_len is missing or not
                positive.
            NotImplementedError: if metric is not one of the names above.
        """
        if metric == 'ed':
            return Levenshtein().distance(a, b)

        if metric == 'dtw':

            def dist_func(x, y):
                # Unit cost for a mismatch, no cost for identical elements.
                return 0 if x == y else 1

            # dtw_func returns four values; only the first one is the distance.
            d, _cost, _acc_cost, _path = dtw_func(a, b, dist=dist_func)
            return d

        if metric == 'lcs':
            return LongestCommonSubsequence().distance(a, b)

        if metric == 'mylcs':
            # Validate up front: dividing by the default None (or by zero)
            # would otherwise fail with an error unrelated to the real cause.
            if max_len is None or max_len <= 0:
                raise ValueError(
                    "Metric 'mylcs' requires a positive max_len, got: {}".format(
                        max_len))
            return LongestCommonSubsequence().distance(a, b) / max_len

        if metric == 'mlcs':  # metric LCS
            return MetricLCS().distance(a, b)

        raise NotImplementedError(
            "Metric not implemented: {}".format(metric))
Esempio n. 2
0
def find_best_candidate(ciphertext):
    """
    Score every message candidate against the ciphertext.

    Despite the name, no "best" entry is selected here: the distance for
    every candidate is returned and the caller decides what to do with them.

    Args:
        ciphertext: value compared against each entry of the module-level
            ``message_candidates`` sequence.

    Returns:
        A list of ``[index, distance]`` pairs, one per candidate and in
        candidate order, where ``distance`` is
        ``Levenshtein().distance(candidate, ciphertext)``.
    """
    # A single scorer is reused for all candidates instead of building a new
    # one on every iteration.
    levenshtein = Levenshtein()
    return [
        [index, levenshtein.distance(plaintext_str, ciphertext)]
        for index, plaintext_str in enumerate(message_candidates)
    ]
def levenshtein(keyword, domain):
    """Return the Levenshtein distance between two values.

    Args:
        keyword: first value handed to ``Levenshtein.distance``.
        domain: second value handed to ``Levenshtein.distance``.

    Returns:
        The result of ``Levenshtein().distance(keyword, domain)``.

    """
    return Levenshtein().distance(keyword, domain)
Esempio n. 4
0
def count_matches(t, L, c, freq_replacements):
    """
    Score a ciphertext against every message candidate for a period of t.

    The first L items of ``c`` are split into ``t`` interleaved substrings
    (items i, i + t, i + 2t, ...), each substring is passed through
    ``frequency_analysis``, the results are interleaved back into a single
    sequence of length L, and that sequence is compared with each entry of
    the module-level ``message_candidates`` (after ``convert_to_numbers``).

    Args:
        t: number of interleaved substrings (the step of the slicing).
        L: number of items of ``c`` to process and length of the rebuilt text.
        c: the ciphertext sequence; it must support extended slicing.
        freq_replacements: passed unchanged to ``frequency_analysis``.

    Returns:
        A list with one Levenshtein distance per message candidate, in
        candidate order.
    """
    # Divide the ciphertext into t interleaved substrings.
    substrings = [c[i:L:t] for i in range(t)]

    # Do frequency analysis on each substring.
    updated_substrings = [
        frequency_analysis(substring, freq_replacements)
        for substring in substrings
    ]

    # Reassemble the substrings into a single message. Extended-slice
    # assignment requires each updated substring to contain exactly as many
    # items as the slots it fills.
    new_text = [None] * L
    for offset, updated in enumerate(updated_substrings):
        new_text[offset::t] = updated

    # Convert the candidate messages to numbers so they can be compared.
    candidate_numbers = [
        convert_to_numbers(message) for message in message_candidates
    ]

    # Score every candidate rather than a hard-coded first five, so the
    # function neither fails on shorter lists nor ignores extra entries.
    levenshtein = Levenshtein()
    return [
        levenshtein.distance(new_text, numbers)
        for numbers in candidate_numbers
    ]
def get_edit_distance(a, b):
    """
    Return the edit distance between a and b, as computed by ``Lev``.
    """
    return Lev().distance(a, b)
Esempio n. 6
0
def query_boosting(search_str):
    """
    Query boosting algorithm.

    Chooses a boost weight for each searchable field from the language and
    the word count of the query:

    * not Sinhala (per ``isSinhala``): ``artist_en`` and ``title_en`` get 1.
    * Sinhala, fewer than 3 words: ``artist_si`` gets 1. The query is then
      compared with the name lists from ``get_all_lists()``; a name within
      Levenshtein distance 4 counts as a match. An artist match raises
      ``artist_si`` to 5 and leaves the other fields at 0. Otherwise each of
      ``music_si``, ``lyricist_si`` and ``melody_si`` gets 5 when its own
      list contains a match.
    * Sinhala, 3 or 4 words: ``lyrics`` gets 2 and ``title_si`` gets 5.
    * Sinhala, 5 or more words: ``lyrics`` gets 3.

    Args:
        search_str: the raw query string.

    Returns:
        A list of ``"<field>^<weight>"`` strings in the order title_si,
        title_en, artist_si, artist_en, music_si, melody_si, lyricist_si,
        lyrics.
    """
    # Output order of the boosted fields; every weight starts at 0.
    field_names = (
        "title_si", "title_en", "artist_si", "artist_en",
        "music_si", "melody_si", "lyricist_si", "lyrics",
    )
    weights = dict.fromkeys(field_names, 0)

    sinhala = isSinhala(search_str)
    # split() without arguments ignores leading/trailing whitespace and
    # collapses repeated spaces, so stray blanks are not counted as words.
    num_words = len(search_str.split())

    ####### The algorithm ########
    if not sinhala:
        weights["artist_en"] = 1
        weights["title_en"] = 1

    elif num_words < 3:
        weights["artist_si"] = 1

        artist, music, lyricist, melody = get_all_lists()
        lev = Levenshtein()

        def has_close_match(names):
            # True as soon as one name is within edit distance 4 of the query.
            return any(lev.distance(name, search_str) <= 4 for name in names)

        if has_close_match(artist):
            # An artist match takes precedence: the other name fields keep
            # their initial weight of 0.
            weights["artist_si"] = 5
        else:
            other_lists = (
                ("music_si", music),
                ("lyricist_si", lyricist),
                ("melody_si", melody),
            )
            for field, names in other_lists:
                if has_close_match(names):
                    weights[field] = 5

    elif num_words < 5:
        weights["lyrics"] = 2
        weights["title_si"] = 5

    else:
        weights["lyrics"] = 3

    # Query attributes building based on weights of each field
    return ["{}^{}".format(name, weights[name]) for name in field_names]
Esempio n. 7
0
# Demo script: print the result of several string distance/similarity classes.

# 4-gram distance between the two strings s1 and s2 defined earlier.
fourgram = NGram(4)
print(fourgram.distance(s1, s2))

# Jaro-Winkler similarity of 'My string' against two scrambled variants.
jarowinkler = JaroWinkler()
for _variant in ('My tsring', 'My ntrisg'):
    print(jarowinkler.similarity('My string', _variant))

optimal_string_alignment = OptimalStringAlignment()
print(optimal_string_alignment.distance('CA', 'ABC'))

# Damerau distance of 'ABCDEF' against a series of edited variants.
damerau = Damerau()
for _variant in ('ABDCEF', 'BACDFE', 'ABCDE', 'BCDEF', 'ABCGDEF', 'POIU'):
    print(damerau.distance('ABCDEF', _variant))

# The same pair is intentionally printed three times for each measure below.
normalized_levenshtein = NormalizedLevenshtein()
for _ in range(3):
    print(normalized_levenshtein.distance('My string', 'My $string'))

for _ in range(3):
    print(normalized_levenshtein.similarity('My string', 'My $string'))

levenshtein = Levenshtein()
for _ in range(3):
    print(levenshtein.distance('My string', 'My $string'))