Python levenshtein Examples, Levenshtein.levenshtein Python Examples

Example #1

0

Show file

File: main.py Project: arielsilveira/Bioinformatica

def algorithm(option, dna_seq):
    """Run the sequence-comparison routine selected by ``option``.

    Options 1 to 5 select hamming, levenshtein, smithWaterman,
    needlemanWusch and blast respectively.  The selected routine is called
    with ``dna_seq[0]`` and ``dna_seq[1]`` and its result is discarded.
    Any other ``option`` does nothing, and ``dna_seq`` is not indexed in
    that case.

    Returns:
        None, always.
    """
    # Each entry is wrapped in a lambda so that neither the routine name nor
    # dna_seq is touched unless its option is actually selected.
    routines = (
        (1, lambda: hamming(dna_seq[0], dna_seq[1])),
        (2, lambda: levenshtein(dna_seq[0], dna_seq[1])),
        (3, lambda: smithWaterman(dna_seq[0], dna_seq[1])),
        (4, lambda: needlemanWusch(dna_seq[0], dna_seq[1])),
        (5, lambda: blast(dna_seq[0], dna_seq[1])),
    )
    for code, run in routines:
        if option == code:
            run()

    return

Example #2

0

Show file

def clustering_8_snm(data):
    """Run sorted_neighborhood on ``data`` using transliteration strings.

    Two sort keys are supplied: the transliteration and the transliteration
    reversed.  The distance between two items is the levenshtein distance
    between their transliterations.

    NOTE(review): the key and distance lookups index the module-level
    ``DATA``, not the ``data`` argument -- presumably ``data`` holds
    indices into ``DATA``; confirm against the caller.
    """

    def transliteration(i):
        return DATA[i]['transliteration']

    def reversed_transliteration(i):
        return DATA[i]['transliteration'][::-1]

    def distance(i, j):
        return levenshtein(DATA[i]['transliteration'], DATA[j]['transliteration'])

    return sorted_neighborhood(
        data,
        radius=1,
        window=20,
        distance=distance,
        keys=(transliteration, reversed_transliteration),
    )

Example #3

0

Show file

def MSP(src, word, MSR=default_MSR, distance_weight=default_dw):
    """Score ``word`` against ``src`` as a product of weights.

    The score starts at ``distance_weight`` raised to
    ``levenshtein(src, word, len_weight=1)``.  It is then multiplied by one
    factor for every distinct character of ``word`` that does not occur in
    ``src``: ``MSR[ch]`` when the character has an entry, otherwise
    ``MSR['other']``.

    Args:
        src: reference string.
        word: candidate string.
        MSR: mapping from character to factor; must contain 'other' whenever
            an unlisted character is encountered.
        distance_weight: base of the exponential distance term.

    Returns:
        The resulting product as a float.
    """
    score = math.pow(distance_weight, levenshtein(src, word, len_weight=1))
    # Distinct characters of `word` that never appear in `src`.
    unseen = set(word) - (set(word) & set(src))
    for char in unseen:
        score *= MSR[char] if char in MSR else MSR['other']
    return score

Example #4

0

Show file

	def check_levenshtein_distance(self, iList, datatype, d_type, split_line):
		"""Fallback match of ``d_type`` against ``datatype`` by edit distance.

		``d_type`` is lower-cased and compared with ``datatype``; a distance
		of at most 2 counts as a match.

		Returns:
			'' when the distance exceeds the tolerance.  Otherwise a notice is
			printed and the result is ``self.ret_indicators_of_compromise(iList)``
			for the 'indicators of compromise' datatype, or every element of
			``split_line`` after the first joined with no separator.
		"""
		MAX_SPELLING_ERRORS = 2

		within_tolerance = levenshtein(d_type.lower(), datatype) <= MAX_SPELLING_ERRORS
		if not within_tolerance:
			return ''

		# Both outcomes below report the same fallback notice.
		print('\tno regex match on %s, using Levenshtein distance' % datatype)
		if datatype == 'indicators of compromise':
			return self.ret_indicators_of_compromise(iList)
		return ''.join(split_line[1:])

Example #5

0

Show file

File: suggest_station.py Project: diagonal-m/sf-station-api

    def suggest(self) -> List[dict]:
        """
        Return the stations on the selected line whose romanized names are
        closest, by Levenshtein distance, to the romanized input station.

        @return: station-name suggestions, at most 10, nearest first
                 e.g.) [{"station_name": station_name1}, {"station_name": station_name2}...]
        """
        # Rows belonging to the selected line; both name columns come from it.
        on_line = self.df[self.df['line_name'] == self.line]
        roman_names = on_line['station_name_roman'].to_list()
        names = on_line['station_name'].to_list()

        query_roman = self.romanaize(self.station)[0]

        # Rank row positions by distance; sorted() is stable, so ties keep
        # their original order.
        ranked = sorted(
            range(len(roman_names)),
            key=lambda pos: levenshtein(query_roman, roman_names[pos]),
        )

        return [{"station_name": names[pos]} for pos in ranked[:10]]

Example #6

0

Show file

File: suggest_station.py Project: diagonal-m/sf-station-api

    def suggest(self) -> list:
        """
        Suggest line names for the entered line name.

        Line names that contain the input as a substring take priority.  When
        there are none, line names are ranked by the Levenshtein distance
        between the romanized input and the romanized line names.

        @return: at most 10 line names, as plain strings.
                 NOTE(review): the original docstring gave the example
                 [{"line_name": line_name1}, {"line_name": line_name2}...],
                 but the code returns the bare names -- confirm which shape
                 callers expect.
        """
        roman_lines = list(self.df['line_name_roman'].unique())
        lines = list(self.df['line_name'].unique())

        containing = [name for name in lines if self.line in name]
        if containing:
            return containing[:10]

        query_roman = self.romanaize(self.line)[0]
        distances = [levenshtein(query_roman, roman) for roman in roman_lines]
        nearest = sorted(range(len(distances)), key=distances.__getitem__)[:10]
        # Positions found in roman_lines are used to index lines; this assumes
        # the two unique() lists line up one-to-one -- TODO confirm.
        return [lines[pos] for pos in nearest]

Example #7

0

Show file

def dist(r1, r2, method='hamming'):
    """Distance between two ranked ballots, computed with the chosen method.

    NOTE(review): the original docstring described the result as "between 0
    and 1+"; nothing in this function normalises the value, so the range
    depends on the selected method -- confirm per method.

    Args:
        r1: first ranking.
        r2: second ranking.
        method: one of 'hamming', 'levenshtein', 'kendall' (alias
            'kendalltau'), 'winner', 'euclidean', 'winner_mistake',
            'winner_distance', 'symmetrical_winner_distance'.

    Returns:
        The value produced by the selected distance.

    Raises:
        ValueError: if ``method`` is not one of the names listed above.
    """
    # Background reading and candidate distances that are not implemented here:
    # https://math.stackexchange.com/questions/2492954/distance-between-two-permutations
    # https://people.revoledu.com/kardi/tutorial/Similarity/OrdinalVariables.html
    # L1 norm between permutation matrices (does it work with ties?)
    # Normalized Rank Transformation
    # Footrule distance
    # Damerau-Levenshtein - transposition distance
    # Cayley distance - Kendall but with any pairs
    # Ulam / LCS distance - number of delete-shift-insert operations (no ties)
    # Chebyshev / maximum distance
    # Minkowski distance
    # Jaro-Winkler distance - only transpositions
    if method == 'hamming':  # Hamming distance: number of differences
        return hamming(r1, r2)
    if method == 'levenshtein':  # deletion, insertion, substitution
        return levenshtein(arr_to_str(r1), arr_to_str(r2))
    if method in ('kendall', 'kendalltau'):  # Absolute Kendall distance
        return kendall_tau_distance(r1, r2)
    if method == 'winner':
        # How far the entry ranked first in r1 is from first place in r2.
        i = np.argmin(r1)
        return r2[i] - r1[i]  # TODO: should be an absolute value?
    if method == 'euclidean':
        # Coerce both operands so plain sequences can be subtracted.
        return np.linalg.norm(np.asarray(r1) - np.asarray(r2))
    if method == 'winner_mistake':  # 0 if the winner is the same (TODO: ties?)
        return 0 if np.argmin(r1) == np.argmin(r2) else 1
    if method == 'winner_distance':
        return winner_distance(r1, r2)
    if method == 'symmetrical_winner_distance':
        return symmetrical_winner_distance(r1, r2)
    # ValueError is still an Exception, so existing handlers keep working.
    raise ValueError('Unknown distance method: {}'.format(method))

Example #8

0

Show file

 def leven_fit(self, word, area=None):
     """Return the entry of ``self.word_list`` closest to ``word``.

     Closeness is ``levenshtein`` evaluated with this object's insert,
     delete and substitute cost maps.

     Args:
         word: the string to match.
         area: optional ``(head, tail)`` pair restricting the search to
             ``self.word_list[head:tail]`` (tail exclusive).  When omitted
             the whole list is searched.

     Returns:
         The first entry with the smallest distance, or '' when no entry
         has a distance below 20.
     """
     if area is None:
         # The end of a slice is exclusive, so the full length is needed
         # here; the previous default of len - 1 skipped the last word.
         head, tail = 0, len(self.word_list)
     else:
         head, tail = area

     answer = ''
     best_distance = 20  # candidates at this distance or more are ignored
     for candidate in self.word_list[head:tail]:
         d = levenshtein(word,
                         candidate,
                         insert_costs=self.ins_map,
                         delete_costs=self.del_map,
                         substitute_costs=self.sub_map)
         if d < best_distance:
             best_distance = d
             answer = candidate
             if d == 0:
                 # Exact match: nothing can beat it.
                 break
     return answer

Example #9

0

Show file

File: autosoundmapping.py Project: aczwink/KORG-Tools

def BankDistance(b1, b2):
	"""Distance between two '/'-separated bank paths.

	Identical paths are at distance 0.  Otherwise the result is
	``BankPrefixMatch`` over everything but the last path component, plus a
	cost for the last components: fixed values for a few special pairs
	involving "Drum & SFX", "User DK" and "User", and the levenshtein
	distance between the two components in every other case.
	"""
	if b1 == b2:
		return 0

	parts1 = b1.split("/")
	parts2 = b2.split("/")
	leaf1, leaf2 = parts1[-1], parts2[-1]

	# Order matters: the "Drum & SFX" pairs are checked before the general
	# "anything -> User" rule.
	if leaf1 == "Drum & SFX" and leaf2 == "User DK":
		leaf_cost = 1
	elif leaf1 == "Drum & SFX" and leaf2 == "User":
		leaf_cost = 1000
	elif leaf2 == "User":
		leaf_cost = 1
	else:
		leaf_cost = levenshtein(leaf1, leaf2)

	return BankPrefixMatch(parts1[:-1], parts2[:-1]) + leaf_cost

Example #10

0

Show file

File: spell.py Project: salman-codes/optimizing_python_code

def spell(word, count=10, dict_words=None):
    """Return the ``count`` dictionary words closest to ``word``.

    Args:
        word: the string to find neighbours for.
        count: length of the slice taken from the ranked list.
        dict_words: candidate words; ``load_words()`` is called when None.

    Returns:
        A list of candidates in ascending levenshtein distance from
        ``word``, cut to the first ``count`` entries.
    """
    if dict_words is None:
        dict_words = load_words()

    def distance_to_word(candidate):
        return levenshtein(word, candidate)

    ranked = sorted(dict_words, key=distance_to_word)
    return ranked[:count]

Example #11

0

Show file

File: autosoundmapping.py Project: aczwink/KORG-Tools

def EntryDistance(e1, e2):
	"""Distance between two entries.

	The result is the sum of ``BankDistance`` over the entries' ``bank``
	attributes and the levenshtein distance between their ``soundName``
	attributes.
	"""
	# Use the arguments' own sound names.  The previous code read `entry` and
	# `targetEntry`, which are not parameters of this function.
	d = BankDistance(e1.bank, e2.bank)
	d += levenshtein(e1.soundName, e2.soundName)

	return d

Example #12

0

Show file

File: query.py Project: opt9/hoply

# Concepts collected from the store for the prefixes of `query`.
candidates = set()

# Maximum number of concepts to print, taken from the command line.
LIMIT = int(sys.argv[3])

start = 0
with WiredTiger(path) as storage:
    start = time.time()
    with h.transaction(storage) as tr:
        # Try the prefixes of the query from the longest to the shortest.
        for count in reversed(range(1, len(query) + 1)):
            # strip the very last \x00 byte
            prefix = pack((query[:count],))[:-1]
            for key, _ in tr.prefix(prefix):
                (concept,) = unpack(key)
                candidates.add(concept)
            # Stop widening the search once there are plenty of candidates.
            if len(candidates) > LIMIT * 10:
                break

# Keep the LIMIT candidates closest to the query.
concepts = sorted(candidates, key=lambda x: levenshtein(x, query))[:LIMIT]

end = time.time()

for concept in concepts:
    print(concept)


print("\n\nTime spent: ", end - start)

Example #13

0

Show file

File: string_utils.py Project: wikisource/ocr-tools

def align(l1, l2, c2):
    """Compute the optimal alignment between two lists of words
    à la Needleman-Wunsch.

    The function returns a (score, alignment) pair. An alignment is simply
    a list of lists of size len(l1) giving, for each word in l1, the list of
    indices in l2 it maps to (the list is empty if the word maps to nothing).

    Note that if the list is of size>1, the word in l1 will map to a sequence
    of words in l2. Conversely, consecutive words in l1 can map to
    the same word in l2.

    Args:
        l1: list of words, treated as the proofread text.
        l2: list of words, treated as the OCR text.
        c2: sequence indexed like l2; its slices are handed to
            join_ocr_words together with the matching slices of l2
            (what it holds is not visible from this function).
    """

    # Throughout the function, l1 is to be thought of as the proofread text,
    # and l2 as the OCR text. The deletion costs are not symmetric: removing
    # junk from the OCR is frequent while removing a word from the proofread
    # text should be rare.
    # Cost of leaving one word of l1 mapped to nothing.
    del_cost1 = 50
    # Cost of skipping one word of l2: 1 plus 3 per alphanumeric character.
    def del_cost2(w):
        return 1+3*len([c for c in w if c.isalnum()])
    w = 3 # multiplicative cost factor for the Levenshtein distance

    n, m = len(l1), len(l2)
    # a is the (score, alignment) matrix. a[i][j] is the (score, alignment)
    # pair of the first i words of l1 to the first j words of l2
    # NOTE(review): xrange is a Python 2 builtin; this block does not run
    # unmodified on Python 3.
    a = [[(0, [])] * (m + 1) for i in xrange(n + 1)]

    # First row: no words of l1 against j words of l2 scores j, with an
    # empty alignment.
    for j in xrange(1, m + 1):
        a[0][j] = j, []

    for i in xrange(1, n + 1):
        # First column: i words of l1 against no words of l2; every word is
        # deleted and maps to nothing.
        a[i][0] = i * del_cost1, [[]] * i

        for j in xrange(1, m + 1):

            # Candidate 1: map word i-1 of l1 onto word j-1 of l2.
            s, b = a[i-1][j-1]
            d = levenshtein(l1[i-1], l2[j-1])
            min_s, min_b  = s + w * d, b + [[j-1]]

            # Candidate 2: leave word i-1 of l1 unmapped.
            s, b = a[i-1][j]
            if s + del_cost1 < min_s:
                min_s, min_b = s + del_cost1, b + [[]]

            # Candidate 3: skip word j-1 of l2; the alignment is unchanged.
            s, b = a[i][j-1]
            if s + del_cost2(l2[j-1]) < min_s:
                min_s, min_b = s + del_cost2(l2[j-1]), b

            # Candidate 4: map the last k words of l1 (joined) onto the last
            # l words of l2 (joined), for k in 1..7 and l in 1..4.
            for k in xrange(1, 8):
                for l in xrange(1, 5):
                    # k == l == 1 is candidate 1, already evaluated above.
                    if k + l <= 2:
                        continue
                    # Groups totalling more than 7 words are not considered;
                    # larger l only grows the total, so leave the inner loop.
                    if k+l > 7:
                        break
                    # Not enough words before position (i, j) for this group.
                    if j < l or i < k:
                        break
                    s, b = a[i-k][j-l]
                    d = levenshtein(join_words(l1[i-k:i]),
                                    join_ocr_words(l2[j-l:j], c2[j-l:j]))
                    if s + w * d < min_s:
                        # Each of the k words of l1 receives the same entry:
                        # the single index j-1, or the run j-l .. j-1.
                        temp = [[j-1]] if l == 1 else [range(j-l, j)]
                        min_s, min_b = s + w * d, b + temp * k

            a[i][j] = min_s, min_b

    return a[n][m]

Example #14

0

Show file

File: string_utils.py Project: wikisource/ocr-tools

 def aux(i):
     """Split ``word`` at index ``i`` and score both halves.

     The left half gets a trailing "-".  Returns (left half, right half,
     levenshtein(left half, left) + levenshtein(right half, right)).
     """
     head = word[:i] + "-"
     rest = word[i:]
     cost = levenshtein(head, left) + levenshtein(rest, right)
     return head, rest, cost

Example #15

0

Show file

#     for person in cluster:
#         writer.writerow({'lang': person['lang'], 'name': person['name']})

# print('Found %i relevant clusters' % RELEVANT_CLUSTERS)
# of.close()

# In[ ]:

# The csv module asks for files it writes to be opened with newline='';
# without it, rows end in \r\r\n on platforms that translate '\n' on write.
# NOTE(review): no close() for `of` is visible in this part of the file --
# make sure it is closed once the rows have been written.
of = open(OUTPUT, 'w', newline='')
writer = csv.DictWriter(of, fieldnames=['lang', 'name'])
writer.writeheader()

# Attach a skeleton key, derived from the name, to every person record.
for p in PERSONS:
    p['skeleton_key'] = skeleton_key(p['name'])


def distance(a, b):
    """Levenshtein distance between the names of PERSONS[a] and PERSONS[b]."""
    return levenshtein(PERSONS[a]['name'], PERSONS[b]['name'])


def key(i):
    """Sort key for PERSONS[i]: (birth, death, skeleton_key).

    A falsy ``birth`` or ``death`` value is replaced by 0.
    """
    person = PERSONS[i]
    birth = person['birth'] or 0
    death = person['death'] or 0
    return birth, death, person['skeleton_key']


# Materialise the result of sorted_neighborhood run over every index of
# PERSONS, using the `key` and `distance` functions defined above.
clusters = list(
    sorted_neighborhood(
        range(len(PERSONS)),
        key=key,
        distance=distance,
        radius=2,
        window=50,
    )
)