Example #1
def get_grades_for_course():
    token = request.args['access_token']
    query = request.args['course']
    status, response = canvas_requests.get(
        token, 'courses', params={'include[]': 'total_scores'})

    if status != 200:
        return response, status

    most_similar_id = None
    best_distance = float('inf')
    for course in response:
        name = course['name']
        code = course['course_code']
        # Levenshtein is a distance, so smaller means more similar.
        name_dist = jellyfish.levenshtein_distance(name, query)
        code_dist = jellyfish.levenshtein_distance(code, query)
        if min(name_dist, code_dist) < best_distance:
            best_distance = min(name_dist, code_dist)
            most_similar_id = course['id']

    course = [
        course for course in response if course['id'] == most_similar_id
    ][0]
    enrollment = course['enrollments'][0]

    return {
        'grade': enrollment['computed_current_grade'],
        'score': enrollment['computed_current_score'],
        'course': course['name']
    }
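
The use of request.args and the (body, status) return tuple suggests this view comes from a Flask app. A minimal wiring sketch, assuming the route path and the canvas_requests helper (neither is shown above):

from flask import Flask, request
import jellyfish
import canvas_requests  # helper module assumed by the example above

app = Flask(__name__)
# Flask 1.1+ will JSON-encode the dict the view returns.
app.add_url_rule('/grades', view_func=get_grades_for_course)
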
Example #2
def levenshtein_ratio(source, target, ignore_case=True):
    """Calculates the Levenshtein ratio between two strings.

    The ratio is computed as follows:
        (len(source) + len(target) - distance) / (len(source) + len(target))

    This function has been ported from (MIT license):
        https://github.com/texttheater/golang-levenshtein/blob/4041401c6e7f6a2b49815c4aea652e518ca8e92e/levenshtein/levenshtein.go#L115-L130

    :param str source: the first string to compare
    :param str target: the second string to compare
    :rtype: float
    :return: the ratio, from 0.0 (no similarity) to 1.0 (identical)
    """

    if ignore_case:
        # Normalize both strings first so the lengths used below match
        # the strings that were actually compared.
        source = source.lower().strip()
        target = target.lower().strip()

    distance = jellyfish.levenshtein_distance(source, target)

    source_len = len(source)
    target_len = len(target)

    return (source_len + target_len - distance) / (source_len + target_len)
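
A quick check of the formula, assuming jellyfish is installed: 'kitten' and 'sitting' are 3 edits apart, so the ratio is (6 + 7 - 3) / (6 + 7) ≈ 0.769.

import jellyfish

assert jellyfish.levenshtein_distance('kitten', 'sitting') == 3
print(levenshtein_ratio('Kitten ', 'sitting'))  # (6 + 7 - 3) / 13 ≈ 0.769
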
Example #3
def distance_filter(df,
                    c,
                    thresh=3,
                    suffix1='_x',
                    suffix2='_y',
                    col1=None,
                    col2=None,
                    nonull=None):
    if (col1 is not None) and (col2 is not None):
        c1 = col1 + suffix1
        c2 = col2 + suffix2
    else:
        c1 = c + suffix1
        c2 = c + suffix2
    if nonull is not None:
        # Caller guarantees neither column contains nulls: compare directly.
        df['distance'] = df.apply(
            lambda x: jf.levenshtein_distance(x[c1], x[c2]), axis=1)
    else:
        # A null on either side gets a large distance (10) so the row is
        # dropped by the threshold filter below.
        df['distance'] = df.apply(lambda x: 10
                                  if (pd.isnull(x[c1]) or pd.isnull(x[c2])) else
                                  jf.levenshtein_distance(x[c1], x[c2]),
                                  axis=1)
    df = df[df.distance <= thresh]

    return df
Example #4
    def extract(self, x, y):
        if x is None or y is None:
            return 0
        if self.similarity:
            # Normalized similarity in [0, 1]; str() replaces Python 2's unicode().
            return 1 - float(levenshtein_distance(str(x), str(y))) / max(len(x), len(y))
        else:
            return levenshtein_distance(str(x), str(y))
Example #5
def token_set_ratio(old_text, new_text):
    old_text_list = re.findall(r"[\w']+", old_text)
    new_text_list = re.findall(r"[\w']+", new_text)

    if len(old_text_list) == 0 or len(new_text_list) == 0:
        return 0

    old_text_list = sorted(old_text_list)
    new_text_list = sorted(new_text_list)

    common_list = get_intersection(old_text_list, new_text_list)
    old_text_list_diff = get_difference(common_list, old_text_list)
    new_text_list_diff = get_difference(common_list, new_text_list)

    common_list = sorted(common_list)
    old_text_list_diff = sorted(old_text_list_diff)
    new_text_list_diff = sorted(new_text_list_diff)

    old_text_list = common_list+old_text_list_diff
    new_text_list = common_list+new_text_list_diff

    common_text_join = " ".join(str(x) for x in common_list)
    old_text_join = " ".join(str(x) for x in old_text_list)
    new_text_join = " ".join(str(x) for x in new_text_list)

    r1 = 100-jellyfish.levenshtein_distance(common_text_join, old_text_join)/len(old_text_join)*100
    r2 = 100-jellyfish.levenshtein_distance(common_text_join, new_text_join)/len(new_text_join)*100
    r3 = 100-jellyfish.levenshtein_distance(old_text_join, new_text_join)/max(len(old_text_join),len(new_text_join))*100

    result = max(r1, r2, r3)

    return round(result,1)
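
get_intersection and get_difference are not shown above. A plausible multiset implementation consistent with how they are called (a sketch, not the original helpers):

from collections import Counter

def get_intersection(list_a, list_b):
    # Tokens common to both lists, respecting duplicate counts.
    return list((Counter(list_a) & Counter(list_b)).elements())

def get_difference(common_list, full_list):
    # Tokens of full_list left over once the common tokens are removed.
    return list((Counter(full_list) - Counter(common_list)).elements())
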
Example #6
def findToken(data, token, max_distance=2):
    result = []

    for j in range(1, max_distance + 1):
        tkl = len(token) + j
        if len(data) >= tkl:
            dl = []
            for i in range(len(data) - tkl):
                distance = jf.levenshtein_distance(data[i:i + tkl], token)
                dl.append(distance)
            for i in range(tkl):
                dl.append(tkl)
            result.append(dl)
        else:
            dl = []
            for i in range(len(data)):
                dl.append(len(token))
            result.append(dl)

    if len(data) >= len(token):
        dl = []
        for i in range(len(data) - len(token)):
            distance = jf.levenshtein_distance(data[i:i + len(token)], token)
            dl.append(distance)
        for i in range(len(token)):
            dl.append(len(token))
        result.append(dl)
    else:
        dl = []
        for i in range(len(data)):
            dl.append(len(token))
        result.append(dl)

    for j in range(1, max_distance + 1):
        tkl = len(token) - j
        if len(data) >= tkl:
            dl = []
            for i in range(len(data) - tkl):
                distance = jf.levenshtein_distance(data[i:i + tkl], token)
                dl.append(distance)
            for i in range(tkl):
                dl.append(tkl)
            result.append(dl)
        else:
            dl = []
            for i in range(len(data)):
                dl.append(len(token))
            result.append(dl)

    if len(result) == 0:
        return
    for dl in result:
        if len(dl) == 0:
            return
    eachResult = np.array(result)
    lowest_i = np.unravel_index(np.argmin(eachResult), eachResult.shape)
    if eachResult[lowest_i[0]][lowest_i[1]] <= max_distance:
        next_i = lowest_i[1] + len(token) + max_distance - lowest_i[0]
        return data[lowest_i[1]:next_i], data[next_i:]
Example #7
def get_matrix_distance(words_list, diagonal=True):
    M = [[[] for _ in words_list] for _ in words_list]  # generate a square matrix
    for i in range(len(words_list)):
        for j in range(len(words_list)):
            if diagonal:
                if j >= i:
                    M[i][j] = levenshtein_distance(words_list[i], words_list[j])  # fill the upper triangle only
            else:
                M[i][j] = levenshtein_distance(words_list[i], words_list[j])  # fill the whole matrix
    return M
Example #8
def get_closest_levenshtein(needle, haystack):
    closest = None
    for x in haystack:
        candidate = (x, jellyfish.levenshtein_distance(needle, x))
        if closest is None or candidate[1] < closest[1]:
            closest = candidate
    if closest is None:
        return None
    return closest[0]
Example #9
def bigram_corr(line):  # input: a sentence
    words = line.split()  # split line into words
    for idx, (word1, word2) in enumerate(zip(words[:-1], words[1:])):
        for i, j in fdist:  # iterate over known bigrams
            # If the 2nd words match and the 1st word is within a small edit
            # distance, replace that word in place (and vice versa). The
            # original overwrote the loop index instead of indexing words.
            if (word2 == j) and (jf.levenshtein_distance(word1, i) < 5):
                words[idx] = i
            elif (word1 == i) and (jf.levenshtein_distance(word2, j) < 5):
                words[idx + 1] = j
    return " ".join(words)
Example #10
def levProDistance(str1, str2):
    c1 = str1.split(" ")
    c2 = str2.split(" ")
    score = 0
    for word in c1:
        levScore = [jf.levenshtein_distance(word, alter) for alter in c2]
        score += min(levScore)
    score2 = 0
    for word in c2:
        levScore = [jf.levenshtein_distance(word, alter) for alter in c1]
        score2 += min(levScore)
    return ((score2 * 1.0 / len(c2)) + (score * 1.0 / len(c1))) / 2
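
A worked example, assuming import jellyfish as jf: in levProDistance('big red car', 'red car'), 'red' and 'car' match exactly, while 'big' is 3 edits from both words on the other side. So score = 3 over 3 words, score2 = 0 over 2 words, and the result is ((0/2) + (3/3)) / 2 = 0.5.

import jellyfish as jf

print(levProDistance('big red car', 'red car'))  # 0.5
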
Example #11
def get_closest_levenshtein(needle, haystack):
    closest = None
    for x in haystack:
        if closest is None:
            closest = (x, jellyfish.levenshtein_distance(needle, x))
        else:
            temp = (x, jellyfish.levenshtein_distance(needle, x))
            if temp[1] < closest[1]:
                closest = temp
    if closest is None:
        return None
    return closest[0]
Example #12
    def compare_two_texts(self, string_a, string_b, normalize_value=True):
        """
        Compare two strings and return the value of the Levenshtein
        algorithm, optionally normalized between 0 and 1.
        """
        # Python 3: both arguments must be str (the old unicode check is gone).
        if isinstance(string_a, str) and isinstance(string_b, str):
            if normalize_value:
                return self.__normalized_value(
                    jellyfish.levenshtein_distance(string_a, string_b))
            else:
                return jellyfish.levenshtein_distance(string_a, string_b)
        else:
            raise TypeError
Example #13
def jelly():
    import jellyfish
    a = u'Korle Bu Teaching Hospital Sickle Cell Dept'
    b = u'Korle Bu Teaching Hospital'
    print(jellyfish.levenshtein_distance(a, b))
    print(jellyfish.jaro_distance(a, b))
    print(jellyfish.damerau_levenshtein_distance(a, b))
    # print(jellyfish.match_rating_comparison(a, b))

    from fuzzywuzzy import fuzz

    print(fuzz.ratio(a, b))
Example #14
def final(mlf):
    print(mlf)
    l1 = []
    l2 = []
    l3 = []
    l4 = []
    l5 = []
    l6 = []
    sdx_input = call_soundex(mlf)
    sdx_raw = call_soundex(utf_corpus())
    for (i, j), (k, v) in product(sdx_input.items(), sdx_raw.items()):
        l1.append(i.split('-')[0])
        l2.append(j)
        l3.append(k.split('-')[0])
        l4.append(v)
        l5.append(jellyfish.levenshtein_distance(j, v))
        l6.append(
            jellyfish.levenshtein_distance(i.split('-')[0],
                                           k.split('-')[0]))
    df = pd.DataFrame(np.nan,
                      index=range(0, len(l1)),
                      columns=[
                          'wrd',
                          'sx_wrd',
                          'cpr',
                          'sx_cpr',
                          'sx_dist',
                          'lv_dist',
                      ])
    df['wrd'] = l1
    df['sx_wrd'] = l2
    df['cpr'] = l3
    df['sx_cpr'] = l4
    df['sx_dist'] = l5
    df['lv_dist'] = l6
    print(df.head(5))

    min_df_lv = df[df['lv_dist'] <= 2]

    selected = []
    for i in range(0, len(mlf)):
        if len(mlf[i]) > 0:
            x = min_df_lv[min_df_lv['wrd'] == list(mlf)[i]].sort_values(
                by='sx_dist', ascending=False).head(10)
            #print(x)
            s = x.groupby(['cpr'])['wrd'].transform('count')
            # .loc replaces the long-deprecated .ix indexer
            selected.append(x['cpr'].loc[s.idxmax()])
            print(x['cpr'].loc[s.idxmax()])
    return selected
Example #15
def suggest_normalizations(sample, threshold=1.0):
    """
    Attempts to identify spelling mistakes between two strings (a, b) using the Levenshtein distance metric, which is
    defined as the minimum edit distance between two strings. In order to identify candidates for replacement, we define
    a similarity measure (s) as follows:

    f = jellyfish.levenshtein_distance
    s = floor((len(a)/f(a, b) + len(b)/f(a, b)) / 2)

    :param sample: a collection of terms to use
    :param threshold: the threshold to use
    :return: a list of candidates for normalization
    """
    if not isinstance(sample, list):
        raise ValueError("suggest_normalizations() expects a list of terms as input (e.g. ['a', 'b', 'c'])")
    else:
        seen = set()
        suggestions = []
        c = collections.Counter(sample).most_common()
        for t1, c1 in c:
            for t2, c2 in c:
                if t1 != t2 and (t1, t2) not in seen:
                    seen.add((t1, t2))
                    seen.add((t2, t1))

                    d = jellyfish.levenshtein_distance(t1, t2)
                    similarity = ((len(t1) / d) + (len(t2) / d)) // 2.0
                    if similarity > threshold:
                        suggestions.append(Suggestion(a=t1, b=t2, edit_distance=d))
        return suggestions
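
Suggestion and the collections import are used above but not defined. A hedged sketch of how this might be exercised (the namedtuple shape is a guess based on the keyword arguments):

import collections
from collections import namedtuple
import jellyfish

# Assumed shape of the result record; the original definition is not shown.
Suggestion = namedtuple('Suggestion', ['a', 'b', 'edit_distance'])

for s in suggest_normalizations(['grey', 'gray', 'grey', 'green']):
    print(s.a, s.b, s.edit_distance)  # e.g. grey gray 1
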
Example #16
def get_insee(postcode, name):
    """
    Convert a postcode to an insee code.
    If there is no exact match, choose the best candidate but record it as problematic.
    """
    global problematicTown
    global problematicPost

    if postcode not in post2insee:  # dict.has_key() is gone in Python 3
        # No match on postcode...
        problematicPost.add(postcode)
        return None
    elif name.upper() in post2insee[postcode]:
        # Perfect match!
        return post2insee[postcode][name.upper()]
    else:
        # No perfect match, look for the best candidate
        best = None
        best_score = None
        for candidate in post2insee[postcode].keys():
            score = jellyfish.levenshtein_distance(name.upper(), candidate)
            if (best_score is None) or (score < best_score):
                best_score = score
                best = candidate
        problematicTown.add(name.upper())
        if best is not None:
            return post2insee[postcode][best]
        else:
            return None
Example #17
def get_fuzzy_dict_features(w, s, dict_name=u'fuzzy', distance=5):
    import jellyfish
    to_return = dict()
    for cand in s:
        if jellyfish.levenshtein_distance(w.lower(), cand) < distance:
            to_return[u'wordlist-{}'.format(dict_name)] = 1
    # Return after checking every candidate, not from inside the loop.
    return to_return
Example #18
def find_card(carddic, s):
  t = { 
  8209: 45, 8211:45, # convert dash
  48: 111, 79: 111, # convert zero and uppercase O to small o
  211: 111, 212: 111, 214: 111, # other chars similar to o
  242: 111, 243: 111, 244: 111, 245: 111, 246: 111, # other chars similar to o
  959:111, 1086:111, 8009:111, 1054:111,    # other chars similar to o
  73:105, 74:105, 106:105, 108:105, 124:105, # convert upper i, upper j, small j, small l and pipe symbol to small i
  161:105, 205:105, 206:105, 236:105, 237:105, 238:105, 239:105, 1575:105,  # convert other chars to i
  192: 65, 193: 65, 194: 65, 196: 65, 1040:65, 1044:65,         # upper A
  200: 69, 201: 69, 202: 69, 1045:69,   # upper E
  85:117,  # convert upper U to small u
  218: 117, 220: 117,  # other conversions to small u
  249: 117, 250: 117, 251: 117, 252: 117, # other conversions to small u
  956: 117, 1094: 117,
  224: 97, 225: 97, 226: 97, 227: 97, 228: 97, 229: 97, # small a conversion
  232: 101, 233: 101, 234: 101, 235: 101 # small e conversion
  }

  d = 999
  dmin = 999
  smin = ""
  for c in carddic:
    d = jellyfish.levenshtein_distance(c.translate(t), s.translate(t))
    if dmin > d:
      dmin = d
      smin = c
      print(c.translate(t) + "/"+ s.translate(t))
  return [carddic[smin], smin, dmin]
Example #19
    def find_match_levenshtein(self, token, canonical):
        candidates = []
        best_score = 2
        for word in self.dicts:
            score = jellyfish.levenshtein_distance(
                token,
                word.decode("utf-8").lower())
            if score <= best_score:
                best_score = score
                candidates.append(word.lower())

        #G = ngram.NGram(candidates)
        #best_candidates = G.search(token, threshold=0.5)

        #results = [item[0] for item in best_candidates]

        is_match = False
        for word in candidates:
            if word == canonical:
                is_match = True
                break

        #if len(best_candidates) > 0:
        #    best_match = best_candidates[0][0]
        #else:
        #    best_match = ""

        return candidates, is_match
Example #20
def checkID_gorinski(movies):

	movie_db = imdb.IMDb()
	correct = 0
	incorrect = 0
	id_mismatch = []

	for item in movies:
		movie_by_ID = movie_db.get_movie(item[-1])

		if jelly.levenshtein_distance(str(item[0]), str(movie_by_ID)) >= 15:

			# try:
			# 	with open(item[2]) as fp:
			# 		contents = fp.readlines()[:60]
			# 		for line in contents:
			# 			line = line.strip()
			# 			if len(line) <= 15:
			# IMDB search character not working,
			# no cross match with actor (delete)

			# except FileNotFoundError: --> lots of file mismatches
			# 	id_mismatch.append(item)

			id_mismatch.append(item)
			incorrect += 1

		else:
			correct += 1
	return (str(correct/(correct+incorrect)*100), id_mismatch)
Example #21
def union_names(anidb_names, absolute_names):
    if not anidb_names and not absolute_names:
        return []
    
    if not anidb_names:
        return absolute_names

    if not absolute_names:
        return anidb_names

    anidb_names_copy = list(anidb_names)
    absolute_names_copy = list(absolute_names)
    name_matches = {}

    while anidb_names_copy:
        anidb_name = anidb_names_copy.pop()

        for name in absolute_names_copy:
            # Python 3 strings compare directly; no .encode('utf-8') needed.
            simi = jellyfish.levenshtein_distance(anidb_name, name)

            if simi < 5:
                # Remove the matched name rather than popping the first element.
                absolute_names_copy.remove(name)
                name_matches[anidb_name] = name
                break

    total_distinct_names = anidb_names
    total_distinct_names.extend([name for name in absolute_names if name not in name_matches.values()])

    return total_distinct_names
Example #22
def get_levenshtein_agseq():
    '''
    get levenshtein distance per antigen
    :return:
    '''
    infile = 'abdb_outfiles_2019/heavy_light_ag_aaseq.csv'
    df = pd.read_csv(infile).iloc[:]
    print(df.info())
    data = []
    for i, pdbid in enumerate(df.pdbid.unique()):
        pdbdf = df[df.pdbid == pdbid]
        agseq1 = pdbdf.iloc[0].a_sequence
        print('computing %s #%s' % (pdbid, i))
        for pdbid2 in df.pdbid.unique():
            if pdbid2 != pdbid:
                pdbdf2 = df[df.pdbid == pdbid2]
                agseq2 = pdbdf2.iloc[0].a_sequence
                ld = jellyfish.levenshtein_distance(agseq1, agseq2)
                # print(ld)
                datum = [pdbid, pdbid2, agseq1, agseq2, ld]
                data.append(datum)
    colnames = ['pdbid1', 'pdbid2', 'agseq1', 'agseq2', 'ld']
    lddf = pd.DataFrame(data, columns=colnames)
    outname = infile[:-4] + '_antigen_full_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)
Example #23
def checkID_agarwal(movies):

	movie_db = imdb.IMDb()
	correct = 0
	incorrect = 0
	id_mismatch = []

	for item in movies:
		movie_by_ID = movie_db.get_movie(item[-1])

		# if levenshtein distance test fails for movie title, continue to check
		# for movie year
		if jelly.levenshtein_distance(str(item[0]), str(movie_by_ID)) >= 10:
			year = str(movie_by_ID["year"])
			writer = list(movie_by_ID["writer"])
			writer_to_str = [str(w) for w in writer]
			with open(item[2]) as fp:
				content = fp.readlines()[:20]
				for w in writer_to_str:
					writer_check = any(w in c for c in content)
				match_year = [s for s in content if year in s]
				if match_year == [] and writer_check == False:
					print("Sanity check failed: \n Year or writer mismatch found. \n {} {}".format(item[-1], item[0]), "\n")
					incorrect += 1
					id_mismatch.append(item)
				else:
					print("Sanity check passed: \n {} {}".format(item[-1], item[0]), "\n")
					correct += 1
		else:
			print("Sanity check passed: \n {} {}".format(item[-1], item[0]), "\n")
			correct += 1

	return (str(correct/(correct+incorrect)*100), id_mismatch)
Example #24
def stringLevensteinFraction(s1, s2, recogHash=False):
    if recogHash:
        s1 = removeHashNSpace(s1)
        s2 = removeHashNSpace(s2)
    s1 = s1.replace(" ", "")
    s2 = s2.replace(" ", "")
    return (1 - jf.levenshtein_distance(s1, s2) / max(len(s1), len(s2)))
Example #25
def p2(lines):
    import itertools
    from jellyfish import levenshtein_distance
    for (l1, l2) in itertools.product(lines, repeat=2):
        d = levenshtein_distance(l1, l2)
        if d == 1:
            return common(l1, l2)
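
The common helper is not shown. If the intent is the classic "two lines that differ by a single edit" puzzle, a plausible sketch keeps the characters the two lines share position by position:

def common(l1, l2):
    # Characters that agree position-by-position; an assumed stand-in
    # for the helper used above.
    return ''.join(a for a, b in zip(l1, l2) if a == b)
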
Example #26
def get_levenshtein_epitopeseq():
    '''
    get levenshtein distance per antigen
    :return:
    '''
    infile = 'abdb_outfiles_2019/heavy_light_ag_aaseq.csv'
    df = pd.read_csv(infile).iloc[:]
    print(df.info())
    data = []
    for i, row in df.iterrows():
        pdbid = row.pdbid
        epitopeseq1 = row.epitope
        for i2, row2 in df.iterrows():
            pdbid2 = row2.pdbid
            if pdbid2 != pdbid:
                epitopeseq2 = row2.epitope
                ld = jellyfish.levenshtein_distance(epitopeseq1, epitopeseq2)
                datum = [pdbid, pdbid2, epitopeseq1, epitopeseq2, ld]
                data.append(datum)
    colnames = ['pdbid1', 'pdbid2', 'epitopeseq1', 'epitopeseq2', 'ld']
    lddf = pd.DataFrame(data, columns=colnames)
    print(lddf.head())
    outname = infile[:-4] + '_antigen_epitope_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)
Example #27
def get_levenshtein_segments_epitope():
    '''
    get levenshtein distance per segment
    :return:
    '''
    infile = 'abdb_outfiles_2019/heavy_light_ag_aaseq.csv'
    df = pd.read_csv(infile).iloc[:]
    print(df.info())
    df = df.dropna(subset=['epitope'])
    data = []
    for segment in df.region.unique():
        segdf = df[df.region == segment]
        print(segment)
        print(segdf.shape)
        counter = 0
        for i, row in segdf.iterrows():
            counter += 1
            # print(counter)
            print('seq1 %s' % row.epitope)
            seq1 = row.epitope
            pdbid = row.pdbid
            for i2, row2 in segdf.iterrows():
                pdbid2 = row2.pdbid
                if pdbid != pdbid2:
                    print('seq2 %s' % row2.epitope)
                    seq2 = row2.epitope
                    ld = jellyfish.levenshtein_distance(seq1, seq2)
                    datum = [pdbid, pdbid2, segment, seq1, seq2, ld]
                    data.append(datum)
    colnames = ['pdbid1', 'pdbid2', 'region', 'epitope1', 'epitope2', 'ld']
    lddf = pd.DataFrame(data, columns=colnames)
    print(lddf.head())
    outname = infile[:-4] + '_antigen_epitope_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)
Example #28
def get_levenshtein_segments():
    '''
    get levenshtein distance per segment
    :return:
    '''
    infile = 'abdb_outfiles_2019/abdb_segment_absequence_full_vgene_imgt_vgene.csv'
    df = pd.read_csv(infile)
    print(df.info())
    data = []
    for segment in df.segment.unique():
        segdf = df[df.segment == segment]
        print(segment)
        print(segdf.shape)
        counter = 0
        for i, row in segdf.iterrows():
            counter += 1
            print(counter)
            seq1 = row.segment_seq
            pdbid = row.pdbid
            for i2, row2 in segdf.iterrows():
                pdbid2 = row2.pdbid
                if pdbid != pdbid2:
                    seq2 = row2.segment_seq
                    ld = jellyfish.levenshtein_distance(seq1, seq2)
                    datum = [pdbid, pdbid2, segment, seq1, seq2, ld]
                    data.append(datum)
    colnames = ['pdbid1', 'pdbid2', 'segment', 'seq1', 'seq2', 'ld']
    lddf = pd.DataFrame(data, columns=colnames)
    print(lddf.head())
    outname = infile[:-4] + '_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)
Example #29
        def levenshtein_apply(pair):
            if _pair_has_any_null(pair):
                LOGGER.debug(
                    "Can't compute Levenshtein distance, "
                    "the pair contains null values: %s",
                    pair,
                )
                return np.nan

            scores = []
            source_list, target_list = pair

            for source in source_list:
                for target in target_list:
                    try:
                        score = 1 - jellyfish.levenshtein_distance(
                            source, target) / np.max(
                                [len(source), len(target)])
                        scores.append(score)
                    except TypeError:
                        if pd.isnull(source) or pd.isnull(target):
                            scores.append(self.missing_value)
                        else:
                            raise

            return max(scores)
Example #30
def apply_soundex(misspell, dictionary):
    count = 0
    result = []

    for mis_word in misspell:
        predict_words = []

        if mis_word not in dictionary:
            if '/' not in mis_word:
                for dict_word in dictionary:
                    soundex_mis = jf.soundex(mis_word)
                    soundex_dict = jf.soundex(dict_word)
                    l_dist = jf.levenshtein_distance(soundex_mis, soundex_dict)

                    predict_words.append((dict_word, l_dist))

                first_five_pred = sorted(predict_words,
                                         key=operator.itemgetter(1),
                                         reverse=False)[:5]
                pred_words = [x[0] for x in first_five_pred]

                result.append(pred_words)

            else:
                # do not predict when  word contains '/', a lazy method
                result.append(mis_word)

        # if mis_word in dictionary
        else:
            result.append(mis_word)

        count += 1
        print("Processing: {} / {}".format(count, len(misspell)), end='\r')

    return result
Example #31
def max_distance(set1, set2):
    if len(set1) == 0 or len(set2) == 0:
        return 0

    return max(1 -
               jellyfish.levenshtein_distance(e1, e2) / max(len(e1), len(e2))
               for e2 in set2 for e1 in set1)
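
Despite its name, max_distance returns the highest normalized similarity over all cross-pairs. For example, assuming jellyfish is installed:

import jellyfish

# 'mitten' is 1 edit from 'kitten', so the best pair scores 1 - 1/6:
print(max_distance({'kitten'}, {'sitting', 'mitten'}))  # ≈ 0.833
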
Example #32
def getSimilarityRpt(similar, hash, base_tlsh):
    # hash is a sha1; support for the other main hashes could be added
    rpt = json.loads(similar)
    length = len(base_tlsh)
    sim = {
        'sha1_hash': hash,
        'tlsh': base_tlsh,
        'data': [],
    }
    if rpt['query_status'] == "ok":

        print(str(len(rpt['data']) - 1) + " similar files to ")
        print('Base : ' + hash + '  tlsh: ' + base_tlsh)
        for sub in rpt['data']:
            dist = jellyfish.levenshtein_distance(base_tlsh, sub['tlsh'])
            percent = str(round(100 * ((length - dist) / length), 2)) + "%"
            if sub['sha1_hash'] != hash:
                print('Sha1 : ' + sub['sha1_hash'] + '  tlsh: ' + sub['tlsh'] +
                      ' Similar at : ' + percent + '  tags:  ' +
                      str(sub['tags']))
                data = {}
                data['sha1_hash'] = sub['sha1_hash']
                data['tlsh'] = sub['tlsh']
                data['similar'] = percent
                sim['data'].append(data)
        y = json.dumps(sim, indent=4)
        return y
    else:
        return rpt['query_status']
Example #33
def select_busqueda(origen_datos, id_user, palabra_busq):
    unid_select = 15
    y, r, items = importa_tablas_2(origen_datos)
    r0 = r[:, id_user]

    # Build a list of (Levenshtein) distances, one per item
    distancia = list()
    palabra_busq = palabra_busq.lower()
    for i in range(len(r0)):
        palabra = items.loc[i][0].lower()
        dist_min = np.inf
        for p in palabra.split():
            if palabra_busq in p:
                dist = 0
                if dist < dist_min:
                    dist_min = dist

            dist = jel.levenshtein_distance(palabra_busq, p)
            if dist < dist_min:
                dist_min = dist
        distancia.append(dist_min)

    jugado = 3

    tabla_slc = crea_tabla_slc(distancia, r0, False, jugado)
    seleccion = ejecuta_seleccion(id_user, items, y, r, unid_select, tabla_slc)

    return seleccion
Example #34
def get_levenshtein_avg(row1, row2):
    total = 0  # 'total' avoids shadowing the built-in sum()
    for columnIndex in range(1, 15):  # range() replaces Python 2's xrange()
        a = row1[columnIndex]
        b = row2[columnIndex]
        total += 1 - jellyfish.levenshtein_distance(a, b) / float(max(len(a), len(b)))
    return total / 14.0
Example #35
    def commission_name_parse(self, string):
        """
        Args:
            string (str): Commission name, such as 'NYB要员护卫'.

        Returns:
            str: Commission genre, such as 'urgent_gem'.
        """
        # if self.is_doa_commission():
        #     return 'doa_daily'
        import jellyfish
        min_key = ''
        min_distance = 100
        string = re.sub(r'[\x00-\x7F]', '', string)
        for key, value in dictionary_jp.items():
            for keyword in value:
                distance = jellyfish.levenshtein_distance(keyword, string)
                if distance < min_distance:
                    min_key = key
                    min_distance = distance
        if min_distance < 3:
            return min_key

        logger.warning(f'Name with unknown genre: {string}')
        self.valid = False
        return ''
Example #36
    def near_dup_search(self, data, max_dist, content, md5, query, db_conn):
        q=''
        for mh in query:
            q+=str(mh)
        # results = self.s.search(q='*:*',fq='content_sg:\"'+q+'\"')
        results = db_conn[self.db][self.collection].find({'content_sg':q,
            '_id':{'$gt':ObjectId(data['_id'])},
            'catalogue_url':{'$ne':data['catalogue_url']},
            # 'dupl':{'$ne':True}
            })

        matches = defaultdict(list)
        # Just loop over it to access the results.
        for result in results:
            # print("The title is '{0}'.".format(result['content'].encode('utf8')))
            if md5 == result['md5_hash']:
                # matches.append(result)
                matches['Exact'].append(result)
            elif jellyfish.levenshtein_distance(content,
                    result['content']) < max_dist*len(content):
                matches['Approximate'].append(result)

        # if len(matches) > 0:
        #     print('Dups for _id:%s found: ' % data['_id'],end='')
        #     # for match in matches:
        #     print(','.join([str(match['_id']) for match in matches]))
        # Require the tuple ('Exact', 'Approximate'); the original
        # ('Exact' and 'Approximate') evaluated to a single string.
        if all(k in matches for k in ('Exact', 'Approximate')):
            del matches['Approximate']

        return matches
Example #37
def alldist(filex, filey):
    xread = open(filex, 'r').read()
    yread = open(filey, 'r').read()
    lvd = jellyfish.levenshtein_distance(xread, yread)
    dlvd = jellyfish.damerau_levenshtein_distance(xread, yread)
    spsum = spamsum.match(xread, yread)
    spsum = 100 - spsum
    spsum = float(spsum / 100.00)
    res = float(lvd / 100.00)
    dres = float(dlvd / 100.00)
    jaro = jellyfish.jaro_distance(xread, yread)
    # Added jaro-winkler distance by fahim 20111011
    jarowink = jellyfish.jaro_winkler(xread, yread)
    jaro = 1.0 - jaro
    jarowink = 1.0 - jarowink
    ham = jellyfish.hamming_distance(xread, yread)
    ham = float(ham / 100.00)
    print("Hamming Distance = ", ham)
    kl = kldiv(tokenize(xread), tokenize(yread))

    return res, dres, jaro, jarowink, ham, kl, spsum
Example #38
def add_query_features(df, inc, exc, k1list, k2list):
    """
    Return a copy of a dataframe with summary features added for
    the named text files defining the query
    """
    df_new = df.copy()
    k1lens = list(map(len, k1list))
    k2lens = list(map(len, k2list))
    k1max = max(k1lens)
    k2max = max(k2lens)
    k1count = len(k1list)
    k2count = len(k2list)
    df_new['k1_count'] = k1count
    df_new['k2_count'] = k2count
    df_new['k1_max'] = k1max
    df_new['k2_max'] = k2max
    jaro_dist = jellyfish.jaro_distance(inc, exc)
    lev_dist = jellyfish.levenshtein_distance(inc, exc)
    ji = textdistance.jaccard(inc, exc)
    sd = textdistance.sorensen(inc, exc)
    ro = textdistance.ratcliff_obershelp(inc, exc)
    #jellyfish.damerau_levenshtein_distance(inc,exc)
    #jellyfish.jaro_winkler(inc,exc)
    df_new['inc_jaro_exc'] = jaro_dist
    df_new['inc_lev_exc'] = lev_dist
    df_new['inc_ji_exc'] = ji
    df_new['inc_sd_exc'] = sd
    df_new['inc_ro_exc'] = ro
    return df_new
Example #39
def get_insee(postcode, name, distmax=5):
    """
    Convert a postcode to an insee code.
    If there is no exact match, choose the best candidate but record it as problematic.
    """
    global problematicTown
    global problematicPost

    # Handle cedex stuff
    if reg_cedex.search(name):
        name = reg_cedex.sub("", name)

    if postcode not in post2insee:  # dict.has_key() is gone in Python 3
        # No match on postcode...
        problematicPost.add(postcode)
        return None
    elif name.upper() in post2insee[postcode]:
        # Perfect match!
        return (name.upper(), post2insee[postcode][name.upper()])
    else:
        # No perfect match, look for the best candidate
        best = None
        best_score = None
        for candidate in post2insee[postcode].keys():
            score = jellyfish.levenshtein_distance(name.upper(), candidate)
            if (best_score is None) or (score < best_score):
                best_score = score
                best = candidate
        problematicTown.add(name.upper())
        if (best is not None) and (best_score < distmax):
            return (best, post2insee[postcode][best])
        else:
            return None
Example #40
    def find_min_dist(lyrics):
        nonlocal min_dist
        nonlocal min_dist_idx
        nonlocal phrase
        nonlocal idx

        # Find best match phrase in lyrics
        min_dist_this_lyrics = 10000
        min_dist_start_idx = 0
        min_dist_end_idx = 0
        lyrics_met = jellyfish.metaphone(lyrics).split(' ')
        for i in range(0, len(lyrics_met) - len(test_met)):
            this_lyrics_met = lyrics_met[i:i + len(test_met)]
            if this_lyrics_met[0] == test_met[0]:
                dist = jellyfish.levenshtein_distance(''.join(test_met), ''.join(this_lyrics_met))
                if dist < min_dist_this_lyrics:
                    min_dist_this_lyrics = dist
                    min_dist_start_idx = i
                    min_dist_end_idx = i + len(test_met)

        # Check against global min
        if min_dist_this_lyrics < min_dist:
            min_dist = min_dist_this_lyrics
            min_dist_idx = idx
            phrase = ' '.join(lyrics.split(' ')[min_dist_start_idx:min_dist_end_idx])

        # Increment global idx
        idx += 1
Example #41
def alldist(filex, filey):
    xread = open(filex, "r").read()
    yread = open(filey, "r").read()
    lvd = jellyfish.levenshtein_distance(xread, yread)
    dlvd = jellyfish.damerau_levenshtein_distance(xread, yread)

    res = float(lvd / 100.00)
    dres = float(dlvd / 100.00)
    # The metrics below were commented out in the original but still
    # returned, which raised NameError; restore them so the function runs.
    jaro = jellyfish.jaro_distance(xread, yread)
    # Added jaro-winkler distance by fahim 20111011
    jarowink = jellyfish.jaro_winkler(xread, yread)
    jaro = 1.0 - jaro
    jarowink = 1.0 - jarowink
    ham = jellyfish.hamming_distance(xread, yread)
    ham = float(ham / 100.00)
    kl = kldiv(tokenize(xread), tokenize(yread))

    return res, dres, jaro, jarowink, ham, kl
Example #42
    def levenshtein_similarity(self, s, t):
        """ Levenshtein Similarity """

        Ns = len(s)
        Nt = len(t)

        lev_sim = 1.0 - (jellyfish.levenshtein_distance(s, t)) / float(max(Ns, Nt))

        return lev_sim
Example #43
def get_avg_word_distance(target_words, predicted_words):
    try:
        trim_target_words = [word.strip() for word in target_words]
        trim_predicted_words = [word.strip() for word in predicted_words]
        dists = [1 - jellyfish.levenshtein_distance(t, p) / max(len(t), len(p)) for t, p in zip(trim_target_words, trim_predicted_words)]
        return sum(dists) / len(dists)
    except ZeroDivisionError:
        return 0
Example #44
    def levenshtein_apply(x):

        try:
            return 1 - jellyfish.levenshtein_distance(x[0], x[1]) / np.max([len(x[0]), len(x[1])])
        except Exception as err:
            if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                return np.nan
            else:
                raise err
Example #45
    def find_nearest_neighbour(self, e, neighbours):
        minneighbour = ""
        mindistance = float('inf')
        for n in neighbours:
            # str() replaces Python 2's unicode()
            d = jellyfish.levenshtein_distance(str(e), str(n))
            if d < mindistance:
                mindistance = d
                minneighbour = n
        return minneighbour, mindistance
Example #46
    def test_levenshtein_distance(self):
        cases = [("", "", 0),
                 ("abc", "", 3),
                 ("bc", "abc", 1),
                 ("kitten", "sitting", 3),
                 ("Saturday", "Sunday", 3),
                 ]

        for (s1, s2, value) in cases:
            self.assertEqual(jellyfish.levenshtein_distance(s1, s2), value)
Example #47
def compare_strings(str1, str2):
    """Compares 2 strings with the Levenshtein distance and returns a normalized
    value between 0.0 and 1.0 (meaning totally different and exactly the same
    respectively."""
    if str1 == str2:
        return 1.0
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        return 0.0
    distance = jellyfish.levenshtein_distance(str1, str2)
    return (max_len - distance) / float(max_len)
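
For example, assuming jellyfish is installed:

import jellyfish

print(compare_strings('kitten', 'sitting'))  # (7 - 3) / 7 ≈ 0.571
print(compare_strings('same', 'same'))       # 1.0
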
Example #48
def bestcandidate(wrd):
    w = wrd
    candidate_list = []
    try:
        #Check the Brown word clusters
        c = bcluster._word[w]
        for rec in c:
            d = rec['cluster']
        recs = bcluster._cluster[d]
        for rec in recs:
            candidate = rec['word']
            levenshtein = jellyfish.levenshtein_distance(w,candidate)
            n2 = jellyfish.metaphone(w)
            n3 = jellyfish.metaphone(candidate)
            if chant.check(candidate):
                #Filter the candidates within a specific character and phonetic distance
                if levenshtein <= 2 or jellyfish.levenshtein_distance(n2, n3) <= 1:
                    candidate_list.append((candidate, rec['count']))
        return candidate_list[-1][0]
    except Exception:
        return 'No'
Example #49
def _string_dist_basic(str1, str2):
    """Basic edit distance between two strings, ignoring
    non-alphanumeric characters and case. Comparisons are based on a
    transliteration/lowering to ASCII characters. Normalized by string
    length.
    """
    str1 = unidecode(str1)
    str2 = unidecode(str2)
    str1 = re.sub(r'[^a-z0-9]', '', str1.lower())
    str2 = re.sub(r'[^a-z0-9]', '', str2.lower())
    if not str1 and not str2:
        return 0.0
    return levenshtein_distance(str1, str2) / float(max(len(str1), len(str2)))
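
A quick check, assuming unidecode and jellyfish are installed: accents, punctuation, and case are all stripped before comparing, so the two titles below normalize to the same string.

import re
from unidecode import unidecode
from jellyfish import levenshtein_distance

print(_string_dist_basic(u'Café, Pt. 1', u'cafe pt 1'))  # 0.0
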
Example #50
def find_similar_pws(pw, pw_list, num_passwords):
	match_indices = []
	best_leven_distances = []
	distance = 0
	for i, each in enumerate(pw_list):
		distance = jf.levenshtein_distance(pw, each)
		match_indices.append(i)
		best_leven_distances.append(distance)
	pwd_tuples = sorted(zip(match_indices, best_leven_distances), key=lambda tup: tup[1])
	pwd_tuples = pwd_tuples[2000:100000]
	pwd_tuples = [pwd_tuples[i] for i in sorted(random.sample(range(len(pwd_tuples)), 1000))]  # range() replaces Python 2's xrange()
	output = lookup_pwds(pwd_tuples, pw_list, num_passwords)
	return output
Example #51
    def compute(self, m0, m1, keys=['DOC_SIM', 'WIN_SIM', 'SENT_SIM', 'OVERLAP']):
        sims = {}

        # str() replaces Python 2's unicode()
        mt_sim = jellyfish.levenshtein_distance(str(m0['mention_text']), str(m1['mention_text']))
        # return {'MT_SIM': mt_sim}
        # sims['DOC_SIM'] = self.cos_sim(m0['doc_tf_idf'], m1['doc_tf_idf'])
        sims['WIN_SIM'] = self.cos_sim(m0['win_tf_idf'], m1['win_tf_idf'])
        # sims['SENT_SIM'] = self.cos_sim(m0['sentence_tf_idf'], m1['sentence_tf_idf'])
        # sims['OVERLAP'] = self.overlap(m0['NER_tags'], m1['NER_tags'], 2)
        # sims['jaccard'] = self.jaccard(m0['win_VEs'], m1['win_VEs'])
        # sims['overlapVe'] = self.overlap(m0['sentence_VEs'], m1['sentence_VEs'], 3)
        sims['win_SIMVe'] = self.cos_sim(self.extractTF_IDF(m0, 'win'), self.extractTF_IDF(m1, 'win'))
        # sims['sentence_SIMVe'] = self.cos_sim(self.extractTF_IDF(m0, 'sentence'), self.extractTF_IDF(m1, 'sentence'))
        # sims['doc_SIMVe'] = self.cos_sim(self.extractTF_IDF(m0, 'doc'), self.extractTF_IDF(m1, 'doc'))
        return sims
Example #52
def string_compare(str1, str2, method='JARO'):
    ''' (string, string, string) -> double
    returns the similarity of str1 and str2 according to the method: LEV or JARO
    '''

    if method == "LEV":
        # computes the Levenshtein distance, an integer greater than or equal to zero
        return jellyfish.levenshtein_distance(str1.lower(), str2.lower())

    if method == "JARO":
        # computes Jaro Winkler measure which is always between 0 and 1
        return jellyfish.jaro_distance(str1, str2)

    print("ERROR: Choose the right string similarity measure : LEV or JARO")
Example #53
    def calculator(aid, pid):
        a_row = authors.get(aid)
        pa_row = paper_authors.get(pid, aid)

        if a_row is None or pa_row is None:
            return np.nan

        # Compare each affiliation to '' separately; the original
        # parenthesization compared (bool or str) to ''.
        if (a_row[Authors.IDX_AFF] == '' or
                pa_row[PaperAuthors.IDX_AFF] == ''):
            return np.nan

        sim = levenshtein_distance(
            unidecode(a_row[Authors.IDX_AFF]).lower(),
            unidecode(pa_row[PaperAuthors.IDX_AFF]).lower()
        )
        return sim
Example #54
def distance(string_1, string_2):
    """Compute the edit distance between two strings.
    """
    return jsonify({
        "levenshtein": jellyfish.levenshtein_distance(string_1, string_2),
        "damerau-levenshtein": jellyfish.damerau_levenshtein_distance(
            string_1,
            string_2
        ),
        "jaro": jellyfish.jaro_distance(string_1, string_2),
        "jaro-winkler": jellyfish.jaro_winkler(string_1, string_2),
        "match_rating_codex": jellyfish.match_rating_comparison(
            string_1,
            string_2
        ),
        "sift3": pymailcheck.sift3_distance(string_1, string_2),
    })
Example #55
def levenshteincmpr(string, list):
    if len(list) == 0:
        return False
    best_lev_match = 999999999
    best_match = None
    fixed_string = strip_name(str(string).lower()).strip()
    for item in list:
        if options['Global']['debug'] == 1:
            print(".....Iterating through {}".format(item))
        fixed_itemstring = strip_name(str(item).lower()).strip()
        levdist = levenshtein_distance(fixed_itemstring, fixed_string)
        if options['Global']['debug'] == 1:
            print("..........file <{}> vs imdb <{}> gave {} levenshtein distance".format(fixed_string, fixed_itemstring, levdist))
        if best_lev_match > levdist:
            best_lev_match = levdist
            best_match = fixed_itemstring

    return {'lev': best_lev_match, 'title': best_match}
Example #56
def test_edit_dist(x):
    s1 = '12012014321231200112211'
    s2 = '1300201231200112211'
    seq1 = [1,2,0,1,2,0,1,4,3,2,1,2,3,1,2,0,0,1,1,2,2,1,1]
    seq2 = [1,3,0,0,2,0,1,2,3,1,2,0,0,1,1,2,2,1,1]
    pos = np.asarray([[0,0],[0,1],   #0 and 1 are nn
                      [2,0],[2,1],   #2 and 3 are nn
                      [4,0],[4,1],   #4 and 5 are nn
                      [6,0],[6,1],   #6 and 7 are nn
                      [8,0],[8,1],   #8 and 9 are nn
                      [9,0],[9,1],   #10 and 11 are nn
                      [10,0],[10,1]],#12 and 13 are nn
                      dtype=float)
                      
    #modify this to ensure it is a non-connected k-nn
    nn = distance.ann(pos,1)[1][:,1:]
    k = 0
    rp = 1
    w = {'M':lambda x:0,'I':lambda x:1,'D':lambda x:1,
         'S':lambda x:2, 'P':lambda x:0.5 }
    a = align.Align(w,rp,nn,k)
    
    u,v = 0,0
    t0 = time.time()
    for i in range(0,int(x)):
        u = jellyfish.levenshtein_distance(s1,s2)
    t1 = time.time()
    t2 = time.time()
    for i in range(0,int(x)):
        v = Levenshtein.editops(s1,s2)
    v = Levenshtein.distance(s1,s2)
    t3 = time.time()
    t4 = time.time()
    for i in range(0,int(x)):
        #v = a.edit_dist(seq1,seq2)
        #w = a.edit_graph(seq1,seq2)
        #w = a.levenshtein(seq1,seq2)
        w = 1
    w = a.edit_dist(seq1,seq2)
    t5 = time.time()
    #w = a.edit_dist(seq1,seq2)
    print('editdist  dist = %s'%v)
    print('seq edit  dist = %s'%w)
    print('editdist  runtime is %s seconds'%(t3-t2))
    print('seq edit  runtime is %s seconds'%(t5-t4))
Example #57
def diff_string(string1, string2, algorithm="RO"):
    """ Defaults to Ratcliff-Obershelp;
    can be changed to the Levenshtein algorithm.
    1 == same string, 0 == no similarity. The two algorithms
    use a reversed score scale, so I have to rescale."""

    if algorithm == "LE":
        d = jf.levenshtein_distance(string1, string2)
        if d == 0:
            return 1
        else:
            return 1 - float(d)/max(len(string1), len(string2))
    elif algorithm == "RO":
        s = SequenceMatcher(None, string1, string2)
        r = s.ratio()
        return r
    else:
        raise Exception("Wrong algorithm chosen for difference match:"
                        + algorithm)
Example #58
    def results(self, query):
        # Look for the query to be a substring of a legislator name
        # (case-insensitive)
        pattern = re.compile(".*%s.*" % query['query'],
                             re.IGNORECASE)

        spec = {'full_name': pattern}

        for prop in query.get('properties', []):
            # Allow filtering by state or chamber for now
            if prop['pid'] in ('state', 'chamber'):
                spec[prop['pid']] = prop['v']

        legislators = db.legislators.find(spec)

        results = []
        for leg in legislators:
            if legislators.count() == 1:
                match = True
                score = 100
            else:
                match = False
                if leg['last_name'] == query['query']:
                    score = 90
                else:
                    distance = levenshtein_distance(leg['full_name'].lower(),
                                                    query['query'].lower())
                    score = 100.0 / (1 + distance)

            # Note: There's a bug in Refine that causes reconciliation
            # scores to be overwritten if the same legislator is returned
            # for multiple queries. see:
            # http://code.google.com/p/google-refine/issues/detail?id=185

            results.append({"id": leg['_id'],
                            "name": leg['full_name'],
                            "score": score,
                            "match": match,
                            "type": [
                                {"id": "/openstates/legislator",
                                 "name": "Legislator"}]})

        # Python 3: sorted() no longer takes cmp; sort by score, descending.
        return sorted(results, key=lambda result: result['score'], reverse=True)