Example #1
0
def algorithm(option, dna_seq):
    """Run the comparison routine selected by ``option`` on a pair of sequences.

    Args:
        option: Selector compared against the integers 1 to 5:
            1 -> hamming, 2 -> levenshtein, 3 -> smithWaterman,
            4 -> needlemanWusch, 5 -> blast.  Any other value does nothing.
        dna_seq: Indexable holding the two sequences at positions 0 and 1.

    Returns:
        None.  Whatever the selected routine returns is discarded.
    """
    # The routines are wrapped in lambdas so that a routine's name is only
    # resolved when its option is actually selected.
    routines = (
        (1, lambda seq: hamming(seq[0], seq[1])),
        (2, lambda seq: levenshtein(seq[0], seq[1])),
        (3, lambda seq: smithWaterman(seq[0], seq[1])),
        (4, lambda seq: needlemanWusch(seq[0], seq[1])),
        (5, lambda seq: blast(seq[0], seq[1])),
    )
    for code, run in routines:
        if option == code:
            run(dna_seq)
Example #2
0
def clustering_8_snm(data):
    """Cluster ``data`` with ``sorted_neighborhood`` using two sort passes.

    The two sort keys are an item's transliteration and the same string
    reversed.  Candidate pairs are compared with the Levenshtein distance
    between their transliterations.

    NOTE(review): the distance and key functions look items up in the
    module-level ``DATA`` rather than in the ``data`` argument; presumably
    ``data`` yields indices into ``DATA`` -- confirm against the caller.

    Args:
        data: Passed as-is to ``sorted_neighborhood``.

    Returns:
        Whatever ``sorted_neighborhood`` returns for radius=1 and window=20.
    """

    def distance(i, j):
        return levenshtein(DATA[i]['transliteration'], DATA[j]['transliteration'])

    def forward_key(i):
        return DATA[i]['transliteration']

    def backward_key(i):
        return DATA[i]['transliteration'][::-1]

    return sorted_neighborhood(
        data,
        radius=1,
        window=20,
        distance=distance,
        keys=(forward_key, backward_key),
    )
Example #3
0
def MSP(src, word, MSR=default_MSR, distance_weight=default_dw):
    """Score ``word`` against ``src`` as a product of weights.

    The score starts at ``distance_weight`` raised to the Levenshtein
    distance between ``src`` and ``word`` (computed with ``len_weight=1``).
    It is then multiplied by one ``MSR`` factor for every distinct character
    of ``word`` that does not occur in ``src``.

    Args:
        src: Source string.
        word: Candidate string.
        MSR: Mapping from character to factor; must contain the key
            ``'other'``, used for characters without their own entry.
        distance_weight: Base of the edit-distance term.

    Returns:
        The resulting product.
    """
    score = 1 * math.pow(distance_weight, levenshtein(src, word, len_weight=1))

    word_chars = set(word)
    unmatched_chars = word_chars - (word_chars & set(src))
    for ch in unmatched_chars:
        factor_key = ch if ch in MSR else 'other'
        score *= MSR[factor_key]
    return score
Example #4
0
	def check_levenshtein_distance(self, iList, datatype, d_type, split_line):
		"""Fuzzy fallback: accept ``d_type`` as ``datatype`` if it is a near miss.

		``d_type`` is lower-cased and compared to ``datatype`` with the
		Levenshtein distance; up to two edits are tolerated.

		Returns:
			'' when the distance exceeds the tolerance.  Otherwise a notice
			is printed and the result is
			``self.ret_indicators_of_compromise(iList)`` for the
			'indicators of compromise' datatype, or the elements of
			``split_line`` after the first one, concatenated, for any other
			datatype.
		"""
		MAX_SPELLING_ERRORS = 2

		close_enough = levenshtein(d_type.lower(), datatype) <= MAX_SPELLING_ERRORS
		if not close_enough:
			return ''

		# Both accepted outcomes announce the fuzzy match the same way.
		print('\tno regex match on %s, using Levenshtein distance' % datatype)
		if datatype == 'indicators of compromise':
			return self.ret_indicators_of_compromise(iList)
		return ''.join(split_line[1:])
    def suggest(self) -> List[dict]:
        """
        Return the stations on the given line whose romanized names are
        closest, by Levenshtein distance, to the romanized input station.

        @return: station name suggestions, at most 10, closest first
                 e.g.) [{"station_name": station_name1}, {"station_name": station_name2}...]
        """
        # Restrict the table to the rows of the requested line once.
        on_line = self.df[self.df['line_name'] == self.line]
        roman_stations = on_line['station_name_roman'].to_list()
        stations = on_line['station_name'].to_list()

        query_roman = self.romanaize(self.station)[0]

        # sorted() is stable, so ties keep their original row order.
        ranked = sorted(
            enumerate(roman_stations),
            key=lambda pair: levenshtein(query_roman, pair[1]),
        )
        return [{"station_name": stations[pos]} for pos, _ in ranked[:10]]
    def suggest(self) -> list:
        """
        Return line names that match the input line name.

        Line names containing the input as a substring win; if there are
        none, the lines whose romanized names have the smallest Levenshtein
        distance to the romanized input are returned instead.

        @return: line name suggestions, at most 10, as plain strings
                 e.g.) [line_name1, line_name2...]

        NOTE(review): an earlier version of this docstring described the
        result as [{"line_name": ...}, ...]; the code returns bare strings.
        NOTE(review): the fuzzy branch assumes the unique romanized names and
        the unique line names line up index-for-index -- confirm.
        """
        roman_lines = list(self.df['line_name_roman'].unique())
        lines = list(self.df['line_name'].unique())

        containing = [name for name in lines if self.line in name]
        if containing:
            return containing[:10]

        query_roman = self.romanaize(self.line)[0]
        ranked = sorted(
            range(len(roman_lines)),
            key=lambda pos: levenshtein(query_roman, roman_lines[pos]),
        )
        return [lines[pos] for pos in ranked[:10]]
Example #7
0
def dist(r1, r2, method='hamming'):
    """Distance between two ranked ballots, computed with the chosen method.

    Args:
        r1: First ranking.  The 'winner' and 'winner_mistake' methods treat
            the position holding the smallest value as the winner.
        r2: Second ranking, indexed the same way as ``r1``.
        method: One of 'hamming', 'levenshtein', 'kendall' (alias
            'kendalltau'), 'winner', 'euclidean', 'winner_mistake',
            'winner_distance', 'symmetrical_winner_distance'.

    Returns:
        The distance.  Note that 'winner' returns a signed difference
        (``r2[i] - r1[i]``), not an absolute value.

    Raises:
        ValueError: If ``method`` is not one of the names listed above.
    """
    # https://math.stackexchange.com/questions/2492954/distance-between-two-permutations
    # https://people.revoledu.com/kardi/tutorial/Similarity/OrdinalVariables.html
    # Candidate metrics that are not implemented here:
    # L1 norm between permutation matrices (does it work with ties?)
    # Normalized Rank Transformation
    # Footrule distance
    # Damerau-Levenshtein - transposition distance
    # Cayley distance - Kendall but with any pairs
    # Ulam / LCS distance - number of delete-shift-insert operations (no ties)
    # Chebyshev / maximum distance
    # Minkowski distance
    # Jaro-Winkler distance - only transpositions
    if method == 'hamming':  # Hamming distance: number of differences
        d = hamming(r1, r2)
    elif method == 'levenshtein':  # Levenshtein distance - deletion, insertion, substitution
        d = levenshtein(arr_to_str(r1), arr_to_str(r2))
    elif method in ('kendall', 'kendalltau'):  # Absolute Kendall distance
        d = kendall_tau_distance(r1, r2)
    elif method == 'winner':  # How far the first-ranked of r1 is from first place in r2
        i = np.argmin(r1)
        d = r2[i] - r1[i]  # TODO: should be an absolute value?
    elif method == 'euclidean':
        # Plain sequences have no elementwise subtraction; convert first.
        if not isinstance(r1, np.ndarray):
            r1, r2 = np.array(r1), np.array(r2)
        d = np.linalg.norm(r1 - r2)
    elif method == 'winner_mistake':  # 0 if the winner is the same (TODO: ties?)
        d = 0 if np.argmin(r1) == np.argmin(r2) else 1
    elif method == 'winner_distance':
        d = winner_distance(r1, r2)
    elif method == 'symmetrical_winner_distance':
        d = symmetrical_winner_distance(r1, r2)
    else:
        # ValueError is a subclass of Exception, so callers that caught the
        # previous generic Exception keep working.
        raise ValueError('Unknown distance method: {}'.format(method))
    return d
Example #8
0
 def leven_fit(self, word, area=None):
     """Return the entry of ``self.word_list`` closest to ``word``.

     Closeness is the weighted Levenshtein distance computed with the
     instance's insert, delete and substitute cost maps.

     Args:
         word: The string to match.
         area: Optional ``(head, tail)`` pair; only
             ``self.word_list[head:tail]`` is searched.  Any two-element
             iterable works, including arrays.

     Returns:
         The first candidate with the smallest distance, or '' when no
         candidate has a distance below 20.

     NOTE(review): without ``area`` the slice is ``[0:len - 1]``, so the
     last entry of ``word_list`` is never considered -- confirm whether
     that is intended.
     """
     best_word = ''
     best_distance = 20  # candidates at or beyond this distance are ignored
     head, tail = 0, len(self.word_list) - 1
     # Identity test: "!= None" compares elementwise on array-like input and
     # then fails when used as a condition.
     if area is not None:
         head, tail = area
     for candidate in self.word_list[head:tail]:
         d = levenshtein(word,
                         candidate,
                         insert_costs=self.ins_map,
                         delete_costs=self.del_map,
                         substitute_costs=self.sub_map)
         if d < best_distance:
             best_distance = d
             best_word = candidate
             if d == 0:
                 # Exact match: nothing can beat it.
                 break
     return best_word
Example #9
0
def BankDistance(b1, b2):
	"""Distance between two '/'-separated bank paths.

	Identical paths are at distance 0.  Otherwise the result is
	``BankPrefixMatch`` applied to the path components before the last one,
	plus a cost for the last components:

	* "Drum & SFX" -> "User DK": 1
	* "Drum & SFX" -> "User": 1000
	* anything else -> "User": 1
	* otherwise: the Levenshtein distance between the two last components
	"""
	if b1 == b2:
		return 0

	parts1 = b1.split("/")
	parts2 = b2.split("/")
	leaf1, leaf2 = parts1[-1], parts2[-1]

	from_drums = leaf1 == "Drum & SFX"
	if from_drums and leaf2 == "User DK":
		leaf_cost = 1
	elif from_drums and leaf2 == "User":
		leaf_cost = 1000
	elif leaf2 == "User":
		leaf_cost = 1
	else:
		leaf_cost = levenshtein(leaf1, leaf2)

	return BankPrefixMatch(parts1[:-1], parts2[:-1]) + leaf_cost
def spell(word, count=10, dict_words=None):
    """Return the ``count`` dictionary words closest to ``word``.

    Args:
        word: The string to find neighbours for.
        count: Maximum number of suggestions returned.
        dict_words: Candidate words; ``load_words()`` is used when omitted.

    Returns:
        A list of candidates ordered by increasing Levenshtein distance to
        ``word``, truncated to ``count`` entries.
    """
    if dict_words is None:
        dict_words = load_words()

    def distance_to_word(candidate):
        return levenshtein(word, candidate)

    ranked = sorted(dict_words, key=distance_to_word)
    return ranked[:count]
Example #11
0
def EntryDistance(e1, e2):
	"""Distance between two entries.

	The result is the bank distance between ``e1.bank`` and ``e2.bank`` plus
	the Levenshtein distance between the two entries' ``soundName``.

	Args:
		e1: First entry; must expose ``bank`` and ``soundName``.
		e2: Second entry; must expose ``bank`` and ``soundName``.
	"""
	bank_cost = BankDistance(e1.bank, e2.bank)
	# Compare the names of the two arguments; the previous code read
	# ``entry`` / ``targetEntry``, which are not defined in this function.
	name_cost = levenshtein(e1.soundName, e2.soundName)
	return bank_cost + name_cost
Example #12
0
File: query.py Project: opt9/hoply
# Concepts collected from the store; a set, so each concept appears once.
candidates = set()

# Maximum number of results to print, taken from the third CLI argument.
LIMIT = int(sys.argv[3])

start = 0
with WiredTiger(path) as storage:
    # Timing starts once the storage is open.
    start = time.time()
    with h.transaction(storage) as tr:
        # Try the whole query first, then ever shorter prefixes of it,
        # down to a single character.
        for count in range(len(query), 0, -1):
            prefix = query[0:count]
            prefix = pack((prefix,))
            # strip the very last \x00 byte
            prefix = prefix[0 : len(prefix) - 1]
            for key, _ in tr.prefix(prefix):
                concept, = unpack(key)
                candidates.add(concept)
            # Stop shortening the prefix once there are more than ten times
            # as many candidates as results wanted.
            if len(candidates) > (LIMIT * 10):
                break

# Rank the candidates by Levenshtein distance to the query, keep the best LIMIT.
concepts = sorted(candidates, key=lambda x: levenshtein(x, query))
concepts = concepts[:LIMIT]

# The measured interval covers the lookup and the ranking, not the printing.
end = time.time()

for concept in concepts:
    print(concept)


print("\n\nTime spent: ", end - start)
Example #13
0
def align(l1, l2, c2):
    """Compute the optimal alignment between two list of words
    à la Needleman-Wunsch.

    The function returns a (score, alignment) pair; a lower score is a
    better alignment. An alignment is simply a list of list of size len(l1)
    giving for each word in l1, the list of indices in l2 it maps to (the
    list is empty if the word maps to nothing).

    Note that if the list is of size>1, the word in l1 will map to a sequence
    of words in l2. Conversely, consecutive words in l1 can map to
    the same word in l2.

    Arguments:
    l1 -- list of words (treated as the proofread text)
    l2 -- list of words (treated as the OCR text)
    c2 -- sliced in parallel with l2 and handed to join_ocr_words;
          presumably per-word OCR data for l2 -- confirm against the caller
    """

    # Throughout the function, l1 is to be thought of as the proofread text,
    # and l2 as the OCR text. The deletion costs are not symmetric: removing
    # junk from the OCR is frequent while removing a word from the proofread
    # text should be rare.
    del_cost1 = 50
    # Cost of dropping an OCR word: 1 plus 3 per alphanumeric character.
    def del_cost2(w):
        return 1+3*len([c for c in w if c.isalnum()])
    w = 3 # multiplicative cost factor for the Levenshtein distance

    n, m = len(l1), len(l2)
    # a is the (score, alignment) matrix. a[i][j] is the (score, alignment)
    # pair of the first i words of l1 to the first j words of l2
    a = [[(0, [])] * (m + 1) for i in xrange(n + 1)]

    # First row: no l1 word consumed, j l2 words skipped at a cost of 1 each.
    for j in xrange(1, m + 1):
        a[0][j] = j, []

    for i in xrange(1, n + 1):
        # First column: i l1 words dropped, each mapping to nothing.
        a[i][0] = i * del_cost1, [[]] * i

        for j in xrange(1, m + 1):

            # Move 1: pair word i-1 of l1 with word j-1 of l2.
            s, b = a[i-1][j-1]
            d = levenshtein(l1[i-1], l2[j-1])
            min_s, min_b  = s + w * d, b + [[j-1]]

            # Move 2: drop word i-1 of l1 (it maps to nothing).
            s, b = a[i-1][j]
            if s + del_cost1 < min_s:
                min_s, min_b = s + del_cost1, b + [[]]

            # Move 3: drop word j-1 of l2; the alignment is unchanged.
            s, b = a[i][j-1]
            if s + del_cost2(l2[j-1]) < min_s:
                min_s, min_b = s + del_cost2(l2[j-1]), b

            # Move 4: pair the last k words of l1 (joined) with the last l
            # words of l2 (joined), for k in 1..7 and l in 1..4.
            for k in xrange(1, 8):
                for l in xrange(1, 5):
                    # k == l == 1 is move 1, already handled.
                    if k + l <= 2:
                        continue
                    # Blocks totalling more than 7 words are not tried; this
                    # break only leaves the inner loop over l.
                    if k+l > 7:
                        break
                    # Not enough words consumed yet for a block of this size.
                    if j < l or i < k:
                        break
                    s, b = a[i-k][j-l]
                    d = levenshtein(join_words(l1[i-k:i]),
                                    join_ocr_words(l2[j-l:j], c2[j-l:j]))
                    if s + w * d < min_s:
                        # Each of the k l1 words gets the same target: the
                        # single index j-1, or the whole run j-l..j-1 (range
                        # yields a list in Python 2, which xrange implies).
                        temp = [[j-1]] if l == 1 else [range(j-l, j)]
                        min_s, min_b = s + w * d, b + temp * k

            a[i][j] = min_s, min_b

    return a[n][m]
Example #14
0
 def aux(i):
     """Split the enclosing ``word`` at index ``i`` and score the split.

     The left part gets a trailing "-".  The score is the sum of the
     Levenshtein distances of the left part to ``left`` and of the right
     part to ``right`` (``word``, ``left`` and ``right`` come from the
     enclosing scope).

     Returns a ``(left_part, right_part, score)`` tuple.
     """
     head = word[:i] + "-"
     rest = word[i:]
     score = levenshtein(head, left) + levenshtein(rest, right)
     return (head, rest, score)
Example #15
0
#     for person in cluster:
#         writer.writerow({'lang': person['lang'], 'name': person['name']})

# print('Found %i relevant clusters' % RELEVANT_CLUSTERS)
# of.close()

# In[ ]:

# Open the output CSV and write its header row (columns: lang, name).
# NOTE(review): the handle is not closed in this cell; presumably a later
# cell calls of.close() -- confirm.
of = open(OUTPUT, 'w')
writer = csv.DictWriter(of, fieldnames=['lang', 'name'])
writer.writeheader()

# Cache each person's skeleton key on the record itself.
for p in PERSONS:
    p['skeleton_key'] = skeleton_key(p['name'])

# Distance between two persons, given their indices in PERSONS:
# the Levenshtein distance between their names.
distance = lambda a, b: levenshtein(PERSONS[a]['name'], PERSONS[b]['name'])


def key(i):
    """Sort key for the person at index ``i`` of ``PERSONS``.

    Returns ``(birth, death, skeleton_key)``, where a falsy birth or death
    value (e.g. ``None``) is replaced by 0.
    """
    person = PERSONS[i]
    birth = person['birth'] or 0
    death = person['death'] or 0
    return (birth, death, person['skeleton_key'])


# Cluster the indices of PERSONS with sorted_neighborhood, using `key` as the
# sort key and the Levenshtein name distance to compare candidates.
# window=50 and radius=2 are passed through unchanged; their exact meaning is
# defined by sorted_neighborhood.
clusters = list(
    sorted_neighborhood(range(len(PERSONS)),
                        distance=distance,
                        window=50,
                        radius=2,
                        key=key))