Example #1
def calNameScore(profile1, profile2):
	name1 = profile1["name"]
	name2 = profile2["name"]
	lang1 = profile1["nameLang"]
	lang2 = profile2["nameLang"]
	if lang1 == lang2:
		if lang1 in langs_not_western:
			return calNotWesternName(name1, name2)
		else:
			# return name_tools.match(name1, name2)
			return calNotWesternName(name1, name2)
	else:
		if len(name1)>3 and len(name2)>3:
			# Sometimes the translation is wrong, so name_tools is not used; e.g. chen shih ying (陳世穎) would be rendered as chen shiying
			try:
				if lang1 != "en":
					name1 = str(TextBlob(name1).translate(to="en"))
			except:
				pass
			try:
				if lang2 != "en":
					name2 = str(TextBlob(name2).translate(to="en"))
			except:
				pass
			return 1-pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
		else:
			return 1-pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
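Example #1 (and its reformatted duplicate, Example #2 below) converts the normalized distance (0.0 for identical strings, 1.0 for completely different ones) into a similarity by subtracting it from 1. A minimal self-contained sketch of that pattern (the name_similarity helper is illustrative, not part of the example above):

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

def name_similarity(name1, name2):
    # 1.0 means identical, 0.0 means no commonality
    return 1 - normalized_damerau_levenshtein_distance(name1, name2)

print(name_similarity('chen shih ying', 'chen shiying'))  # ~0.86, two edits over 14 characters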
Example #2
def calNameScore(profile1, profile2):
    name1 = profile1["name"]
    name2 = profile2["name"]
    lang1 = profile1["nameLang"]
    lang2 = profile2["nameLang"]
    if lang1 == lang2:
        if lang1 in langs_not_western:
            return calNotWesternName(name1, name2)
        else:
            # return name_tools.match(name1, name2)
            return calNotWesternName(name1, name2)
    else:
        if len(name1) > 3 and len(name2) > 3:
            # Sometimes the translation is wrong, so name_tools is not used; e.g. chen shih ying (陳世穎) would be rendered as chen shiying
            try:
                if lang1 != "en":
                    name1 = str(TextBlob(name1).translate(to="en"))
            except:
                pass
            try:
                if lang2 != "en":
                    name2 = str(TextBlob(name2).translate(to="en"))
            except:
                pass
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
                name1, name2)
        else:
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
                name1, name2)
Example #3
def flag_na_n_ad(InputKamus, InputFTS):

    InputKamus = str(InputKamus)
    InputFTS = str(InputFTS)
    digits_input = ' '.join(re.findall(r'\d+', InputKamus))
    digits_fts = ' '.join(re.findall(r'\d+', InputFTS))
    no_int_kamus = ' '.join([
        x for x in InputKamus.split()
        if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())
    ])
    no_int_fts = ' '.join([
        x for x in InputFTS.split()
        if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())
    ])
    # try:
    if digits_input or digits_fts:
        nilai_digits = normalized_damerau_levenshtein_distance(
            digits_fts, digits_input)
        # print "number", nilai_digits
        # print 'DIGITS', nilai_digits
        if nilai_digits >= 0.5:
            return 'CONFIDENT'
        elif nilai_digits <= 0.4:
            return 'CONFIDENT'

    if no_int_kamus or no_int_fts:
        nilai_kalimat = normalized_damerau_levenshtein_distance(
            no_int_kamus, no_int_fts)
        # print "word", nilai_kalimat
        # print 'KATA', nilai_kalimat
        if nilai_kalimat >= 0.5:
            return 'CONFIDENT'
        elif nilai_kalimat <= 0.4:
            return 'CONFIDENT'
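Example #3 scores the numeric and non-numeric parts of two strings separately. A short sketch of its token-splitting step, on a made-up address string:

import re

s = 'Jl. Sudirman No. 12 Blok B-3'
digits = ' '.join(re.findall(r'\d+', s))  # '12 3'
words = ' '.join(x for x in s.split()
                 if not (x.isdigit() or (x[0] == '-' and x[1:].isdigit())))
print(digits)  # the digit runs only
print(words)   # every token that is not a pure (possibly negative) integer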
Example #4
def get_similarities(Features, url_input):
    """
    similarity metrics include: Levenshtein, jaro, damerau levenshtein, normalized_damerau_levenshtein,
    and hamming distance
    :param Features: input dictionary to add things to
    :param url_input: the URL string to compare against the domain and brand names
    :return: Features: after adding all similarity metrics
    """
    for n in itertools.chain(product_domain_names, brand_names):
        Features['url_levenshtein_distance_' + n] = Levenshtein.distance(
            url_input, n)
        Features['fqdn_levenshtein_distance_' + n] = Levenshtein.distance(
            Features['fqdn'], n)
        Features['url_jaro_winkler_distance_' + n] = jw.get_jaro_distance(
            url_input, n)
        Features['fqdn_jaro_winkler_distance_' + n] = jw.get_jaro_distance(
            Features['fqdn'], n)
        Features['url_damerau_levenshtein_distance_' +
                 n] = dl.damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_distance_' +
                 n] = dl.damerau_levenshtein_distance(Features['fqdn'], n)
        Features['url_damerau_levenshtein_normalized_distance_' +
                 n] = dl.normalized_damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_normalized_distance_' +
                 n] = dl.normalized_damerau_levenshtein_distance(
                     Features['fqdn'], n)
        if len(n) == len(url_input):
            Features['url_length_equals_' + n] = 1
            Features['url_hamming_distance_' + n] = hamming(url_input, n)
            Features['fqdn_hamming_distance_' + n] = hamming(
                Features['fqdn'], n)
        else:
            Features['url_length_equals_' + n] = 0
    return Features
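Example #4 stores both the raw and the normalized Damerau-Levenshtein distances as features. A hypothetical typosquatting pair shows the difference:

from pyxdameraulevenshtein import (damerau_levenshtein_distance,
                                   normalized_damerau_levenshtein_distance)

print(damerau_levenshtein_distance('paypa1.com', 'paypal.com'))             # 1 (absolute edit count)
print(normalized_damerau_levenshtein_distance('paypa1.com', 'paypal.com'))  # 0.1 (scaled by the longer string)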
Example #5
def get_str_similarity(source, target, label):
    try:
        source = source.lower()
        target = target.lower()
        long_sub = combine_google_fsq_for_gt.longest_substring(source, target)
        similarities = {
            label + "rosimilarity":
            combine_google_fsq_for_gt.get_ro_similarity(source, target),
            label + "dlevensimilarity":
            1 - normalized_damerau_levenshtein_distance(source, target),
            label + "levensimilarity":
            Levenshtein.ratio(source, target),
            label + "phoneticsimilarity":
            combine_google_fsq_for_gt.get_levenshtein_phonetic_similarity(
                source, target),
        }
        if (len(long_sub)):
            similarities[label +
                         "lenlongsubstring"] = len(long_sub) / len(source)
        else:
            similarities[label + "lenlongsubstring"] = 0
    except:
        similarities = {
            label + "rosimilarity": None,
            label + "dlevensimilarity": None,
            label + "levensimilarity": None,
            label + "phoneticsimilarity": None,
            label + "lenlongsubstring": None,
        }
    res = {k: v for k, v in similarities.items()}
    return res
Example #6
 def rank(self, src, tgt):
     ''' Returns the rank of the source and target paths. '''
     p = len(set(tgt) - set(src))
     a = normalized_damerau_levenshtein_distance(str(src), str(tgt)) + p
     b = max(len(src), len(tgt)) + p
     candidateScore = 1 - (a / b)
     return candidateScore
Example #7
 def rank(self, src, tgt):
     ''' Returns the rank of the source and target paths. '''
     p = len(set(tgt) - set(src))
     a = normalized_damerau_levenshtein_distance(unicode(src), unicode(tgt)) + p
     b = max(len(src), len(tgt)) + p
     candidateScore = 1 - (a/b)
     return candidateScore
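Example #7 is the Python 2 variant of Example #6 (unicode() instead of str()). Both add a penalty p for elements of the target that never occur in the source before scaling. A standalone sketch for trying the formula outside the class:

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

def rank(src, tgt):
    p = len(set(tgt) - set(src))  # elements of tgt missing from src
    a = normalized_damerau_levenshtein_distance(str(src), str(tgt)) + p
    b = max(len(src), len(tgt)) + p
    return 1 - (a / b)

print(rank('/usr/bin', '/usr/local/bin'))  # closer to 1.0 the more similar the paths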
Example #8
def norm_dld(l1, l2):
    ascii_start = 0
    # make a string for l1
    # all triples are unique...
    s1 = ''.join((chr(ascii_start+i) for i in range(len(l1))))
    s1_upd = list(s1)
    for i in range(len(l1)):
        for j in range(i+1, len(l1)):
            if trip_match(l1[i], l1[j]):
                s1_upd[j] = s1[i]
    s1_upd = ''.join(s1_upd)
    s2 = ''
    next_char = ascii_start + len(s1)
    for j in range(len(l2)):
        found = None
        #next_char = chr(ascii_start+len(s1)+j)
        for k in range(len(l1)):
            if trip_match(l2[j], l1[k]):
                found = s1_upd[k]
                #next_char = s1[k]
                break
        if found is None:
            s2 += chr(next_char)
            next_char += 1
            #assert next_char <= 128
        else:
            s2 += found
    # return 1- , since this thing gives 0 to perfect matches etc
    return 1.0-normalized_damerau_levenshtein_distance(s1_upd, s2)
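Example #8 maps each unique triple to a single character so that the string-based distance applies to arbitrary sequences. A simplified sketch of the same encoding trick, with trip_match replaced by plain equality (newer releases of pyxdameraulevenshtein also accept lists directly, as Example #32 shows):

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

def seq_similarity(seq1, seq2):
    codes = {}  # shared mapping: item -> one-character code

    def encode(seq):
        return ''.join(chr(codes.setdefault(item, len(codes))) for item in seq)

    s1, s2 = encode(seq1), encode(seq2)
    return 1.0 - normalized_damerau_levenshtein_distance(s1, s2)

print(seq_similarity([('a', 1), ('b', 2)], [('b', 2), ('a', 1)]))  # 0.5, one transposition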
Example #9
def compare_DL(filename1, filename2):
    # Sanity check
    if not os.path.isfile(filename1):
        print('\nERROR: First source file ' + filename1 + ' was not found.\n')
        return (-3)
    if not os.path.isfile(filename2):
        print('\nERROR: Second source file ' + filename2 + ' was not found.\n')
        return (-4)

    # Read the content of the first file
    text1 = ""
    f1 = None
    with open(filename1) as f1:
        lines1 = [line.rstrip('\n') for line in f1]
    for line in lines1:
        text1 += line + ' '
    text1 = text1[:-1]

    # Read the content of the second file
    text2 = ""
    f2 = None
    with open(filename2) as f2:
        lines2 = [line.rstrip('\n') for line in f2]
    for line in lines2:
        text2 += line + ' '
    text2 = text2[:-1]

    sim = 1.0 - normalized_damerau_levenshtein_distance(text1, text2)
    return (sim)
Example #10
def winner(v_):
    n = v_.size
    sim = [0.0] * n

    if n == 1: return 0, 1.0

    i = 0
    while (i < n):
        j = i + 1
        while (j < n):
            s = 1.0 - normalized_damerau_levenshtein_distance(v_[i], v_[j])
            if s == 1.0:  # Two identical values
                return i, 1.0
            sim[i] = sim[i] + s
            sim[j] = sim[j] + s
            j = j + 1
        i = i + 1

    # Search maximum and save the index to return it
    sim_max = sim[0]
    i_max = 0
    i = 1
    while (i < n):
        if (sim_max < sim[i]):
            i_max = i
            sim_max = sim[i]
        i = i + 1

    return i_max, (sim_max / (n - 1))
Example #11
def winner(v_):
    n = len(v_)
    sim = [0.0] * n

    if n == 1: return -1, 1.0

    i = 0
    while (i < n):
        j = i + 1
        while (j < n):
            if v_[i] == v_[j]:
                return (i, 1.0)
            s = 1.0 - normalized_damerau_levenshtein_distance(v_[i], v_[j])
            sim[i] = sim[i] + s
            sim[j] = sim[j] + s
            j = j + 1
        i = i + 1

    # Search maximum and return the index
    sim_max = sim[0]
    i_max = 0
    i = 1
    while (i < n):
        if (sim_max < sim[i]):
            i_max = i
            sim_max = sim[i]
        i = i + 1

    return i_max, (sim_max / (n - 1))
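winner() sums each value's similarity to every other value and returns the index of the most central one, short-circuiting as soon as two identical values are found. Hypothetical usage of the Example #11 variant:

variants = ['color', 'colour', 'colr']
idx, confidence = winner(variants)
print(variants[idx], confidence)  # the spelling most similar to the rest, on average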
Example #12
 def calc(self, search_word: str):
     word1_string = mojimoji.han_to_zen(search_word.lower())
     word2_string = mojimoji.han_to_zen(self.keyword.lower())
     self.distance = Decimal("1.0") - Decimal(
         str(
             normalized_damerau_levenshtein_distance(
                 word1_string, word2_string)))
     return self
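Example #12 normalizes half-width characters to full-width with mojimoji before measuring the distance, so visually equivalent Japanese strings compare as identical. A minimal sketch, assuming the mojimoji package is installed:

import mojimoji
from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

a = mojimoji.han_to_zen('ﾎﾟｯｷｰ')  # half-width katakana -> 'ポッキー'
print(normalized_damerau_levenshtein_distance(a, 'ポッキー'))  # 0.0 after normalization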
Example #13
def calDisplayNameScore(profile1, profile2):
	name1 = profile1["displayName"]
	name2 = profile2["displayName"]
	lang1 = profile1["displayNameLang"]
	lang2 = profile2["displayNameLang"]
	if lang1 == lang2:
		return 1-pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
	else:
		if len(name1)>3 and len(name2)>3:
			try:
				if lang1 != "en":
					name1 = str(TextBlob(name1).translate(to="en"))
				if lang2 != "en":
					name2 = str(TextBlob(name2).translate(to="en"))
			except:
				pass
			return 1-pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
		else:
			return 1-pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
Example #14
def is_similar(source: str, target: str, threshold: float) -> bool:
    """
    使用現成模組,來源
    https://github.com/gfairchild/pyxDamerauLevenshtein
    Args:
        source: 使用者的查詢字串
        target: 資料庫的字串
        threshold: 一個介於0到1之間的值,表示可以容許兩個字串相異程度的最大值
    """
    distance = normalized_damerau_levenshtein_distance(source, target)
    return distance < threshold
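Hypothetical usage of is_similar; threshold is the largest normalized distance still accepted as a match:

print(is_similar('restarant', 'restaurant', threshold=0.3))  # True, distance is 0.1
print(is_similar('cafe', 'restaurant', threshold=0.3))       # False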
Example #15
def suffix_measure(x, y, x2, res):
    for k in range(len(y)):
        b = y[k]
        h = b != 1
        b = b[h].clone()
        a = x2[:, x.shape[1]:][k]
        a = a[h].clone()
        a_res = a.cpu().numpy()
        b_res = b.cpu().numpy()

        res.append(
            1 -
            normalized_damerau_levenshtein_distance(list(a_res), list(b_res)))
Example #16
def calDisplayNameScore(profile1, profile2):
    name1 = profile1["displayName"]
    name2 = profile2["displayName"]
    lang1 = profile1["displayNameLang"]
    lang2 = profile2["displayNameLang"]
    if lang1 == lang2:
        return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
            name1, name2)
    else:
        if len(name1) > 3 and len(name2) > 3:
            try:
                if lang1 != "en":
                    name1 = str(TextBlob(name1).translate(to="en"))
                if lang2 != "en":
                    name2 = str(TextBlob(name2).translate(to="en"))
            except:
                pass
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
                name1, name2)
        else:
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
                name1, name2)
Example #17
 def test_normalized_damerau_levenshtein_distance(self):
     assert normalized_damerau_levenshtein_distance(
         'smtih', 'smith') == 0.20000000298023224
     assert normalized_damerau_levenshtein_distance(
         'snapple', 'apple') == 0.2857142984867096
     assert normalized_damerau_levenshtein_distance(
         'testing', 'testtn') == 0.2857142984867096
     assert normalized_damerau_levenshtein_distance('saturday',
                                                    'sunday') == 0.375
     assert normalized_damerau_levenshtein_distance('Saturday',
                                                    'saturday') == 0.125
     assert normalized_damerau_levenshtein_distance('orange',
                                                    'pumpkin') == 1.0
     assert normalized_damerau_levenshtein_distance(
         'gifts', 'profit') == 0.8333333134651184
     assert normalized_damerau_levenshtein_distance('Sjöstedt',
                                                    'Sjostedt') == 0.125
Example #18
    def calculatePathDistance(self, pathA, pathB):
        semA, semB = self.createSemesterWords(pathA, pathB)
        for i, s1 in enumerate(semA):
            distRow = np.array([])
            for j, path2 in enumerate(self.paths):
                if j <= i:
                    distRow = np.append(distRow, [0])
                else:
                    distRow = np.append(distRow, [self.calculatePathDistance(pathA, path2)])
            if self.distanceMatrix is None:
                self.distanceMatrix = np.array([distRow])
            else:
                self.distanceMatrix = np.vstack((self.distanceMatrix, distRow))

        strSemesterA, strSemesterB = ''.join(semA), ''.join(semB)
        distance = normalized_damerau_levenshtein_distance(strSemesterA, strSemesterB)
        return distance
Example #19
def norm_dld(l1, l2):
    ascii_start = 0
    assert len(l1) + len(l2) <= 128
    # make a string for l1
    # all triples are unique...
    s1 = ''.join((chr(ascii_start+i) for i in xrange(len(l1))))
    s2 = ''
    for j in xrange(len(l2)):
        next_char = chr(ascii_start+len(s1)+j)
        for k in xrange(len(l1)):
            if trip_match(l2[j], l1[k]):
                next_char = s1[k]
                break
        s2 += next_char
    # return 1- , since this thing gives 0 to perfect matches etc
    return 1.0-normalized_damerau_levenshtein_distance(s1, s2)
Example #20
def string_similarity_ratio(s1, s2):
    """
    A string compare function, using the Redcliff-Obershelp algorithm. For
    further details see: http://docs.python.org/3.3/library/difflib.html
    TODO: Levenshtein might be better for this purpose.

    :params s1, s2: Two input strings which will be compared
    :returns: A ratio between 0.0 (not similar at all) and 1.0 (probably the
    same string).

    """
    if s1 and s2:
        return 1 - normalized_damerau_levenshtein_distance(
            _clean_movie_title(s1),
            _clean_movie_title(s2)
        )
Example #21
def generate_reward(gold_summary, summary, gold_cp, cp, reward_type=1):
    #Bleu score
    # bleu = corpus_bleu([gold_summary],summary)

    cp = list(deepcopy(cp))
    # DLD
    if gold_cp:
        dld = normalized_damerau_levenshtein_distance(list(gold_cp), list(cp))
    else:
        dld = 0.
    boolean = np.zeros(len(cp))
    for pos, element in enumerate(cp):
        if element in gold_cp:
            boolean[pos] = 1
    precision = np.mean(boolean)
    recall = np.sum(boolean) / len(gold_cp)
    return (precision + recall + (1 - dld)) / 3
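The reward averages precision, recall, and one minus the normalized edit distance between the two sequences. A toy run with made-up content-plan tokens (the two summary arguments are unused in the body above):

gold_cp = ['PLAYER', 'TEAM', 'SCORE']
cp = ['TEAM', 'PLAYER', 'SCORE']
print(generate_reward(None, None, gold_cp, cp))  # (1 + 1 + (1 - 1/3)) / 3 ~= 0.889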
Example #22
    def compute_distances(reference, values, exp=None):
        if exp is None:
            exp = Config.SIMILARITY_EXPONENT

        sm = difflib.SequenceMatcher()
        sm.set_seq2(reference)
        sm_distances = []
        dl_distances = []
        for val in values:
            sm.set_seq1(val)
            sm_distances.append(sm.ratio())
            dl_distances.append(
                1 - normalized_damerau_levenshtein_distance(reference, val))
        sm_distances = np.array(sm_distances)
        dl_distances = np.array(dl_distances)
        dl_exp = np.power(dl_distances, exp)
        sm_exp = np.power(sm_distances, exp)
        dist_sum = dl_exp + sm_exp
        return np.power(dist_sum, 1 / exp)
Example #23
    def calc_distance_domains(self):
        not_in_top = []
        output = []
        for item in self.data:
            if item not in self.top_domains:
                not_in_top.append(item)

        for item in not_in_top:
            entry = {item: {}}
            flag = False
            for td in self.top_domains:
                dist = normalized_damerau_levenshtein_distance(item, td)
                if 0 < dist < 0.2:
                    entry[item][td] = dist
                    flag = True
            if flag is True:
                output.append(entry)
        if len(output) > 0:
            return output
        else:
            return None
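Example #23 flags a domain when its normalized distance to a top domain lands in the open interval (0, 0.2): close enough to look like a typo, but not an exact match. The window on hypothetical values:

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

print(normalized_damerau_levenshtein_distance('goggle.com', 'google.com'))  # 0.1, would be flagged
print(normalized_damerau_levenshtein_distance('google.com', 'google.com'))  # 0.0, identical, ignored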
Example #24
def processtxn(txn, choices):
    maxscoreJ = 0
    matchstrJ = ""
    maxscoreDL = 0
    matchstrDL = ""
    maxscoreNDL = 0
    matchstrNDL = ""
    for c in choices:
        scoreJ = jaro.jaro_metric(txn, c)
        scoreDL = 1000 - damerau_levenshtein_distance(txn, c)
        scoreNDL = 1 - normalized_damerau_levenshtein_distance(txn, c)
        if scoreJ > maxscoreJ:
            matchstrJ = c
            maxscoreJ = scoreJ
        if scoreDL > maxscoreDL:
            matchstrDL = c
            maxscoreDL = scoreDL
        if scoreNDL > maxscoreNDL:
            matchstrNDL = c
            maxscoreNDL = scoreNDL
    return {'jaro': matchstrJ, 'dl': matchstrDL, 'ndl': matchstrNDL}
Example #25
def get_likely_cfg_array_version(glycan_list, distance_threshold=2.0):
    '''Get the most likely CFG glycan array given a list of glycans.

    Uses a scaled Levenshtein distance to compute similarity between glycan strings,
    and returns the array with the minimum sum of scaled levenshtein distances
    for each pair of glycans in the glycan list and corresponding reference array.

    We need to do this because sometimes the array version is not provided, and 
    there are slight spelling errors in the provided glycan names. It is easier to
    match to a reference list of glycans for a particular array version, with all 
    errors corrected.

    Args:
        glycan_list (list): A list of glycan strings ordered by index.
        distance_threshold (float): A threshold for total scaled Levenshtein distance for calling a match.
    Returns:
        CFG glycan list (list), most likely array version (string), number of mismatches (int), scaled Levenshtein distance (float)
    '''
    glycan_list = list(glycan_list)
    for i, glycan in enumerate(glycan_list):
        # Handle odd characters in some excel files. Nonbreaking spaces, greek letters etc.
        glycan_list[i] = glycan.replace('–', '-').replace('α', 'a') \
                            .replace('β', 'b').replace('[', '(') \
                            .replace(']', ')').replace(' ', '').replace(u"\u00A0", '')
    likely_array = None
    likely_array_mismatches = None
    scaled_levenshtein_total = 0
    for key, value in cfg_array_versions.items():
        # Take into account glycans which are almost the same.
        array_version = [x.replace(' ', '') for x in value[1]]
        scaled_levenshtein_sum = np.sum([normalized_damerau_levenshtein_distance(x, y) for x, y in zip_longest(glycan_list, array_version, fillvalue='')])
        non_matches = len([x for x in zip(glycan_list, array_version) if x[0] != x[1]])
        #if not likely_array or non_matches < likely_array_mismatches:
        if not likely_array or scaled_levenshtein_sum < scaled_levenshtein_total:
            likely_array = key
            likely_array_mismatches = non_matches
            scaled_levenshtein_total = scaled_levenshtein_sum
    if scaled_levenshtein_total > distance_threshold:
        raise ValueError("Glycan list does not match to known array versions.")
    return list(cfg_array_versions[likely_array][1]), likely_array, likely_array_mismatches, scaled_levenshtein_total
Example #26
def get_str_similarity(source, target):
    source = source.lower()
    target = target.lower()
    long_sub = longest_substring(source, target)
    #"long_substring": longest_substring(source, target),
    similarities = {
        "ro_similarity":
        get_ro_similarity(source, target),
        "dleven_similarity":
        1 - normalized_damerau_levenshtein_distance(source, target),
        "leven_similarity":
        Levenshtein.ratio(source, target),
        "phonetic_similarity":
        get_levenshtein_phonetic_similarity(source, target),
    }
    if (len(long_sub)):
        similarities["len_long_substring"] = len(long_sub) / len(source)
    else:
        similarities["len_long_substring"] = 0
    res = {k: v for k, v in similarities.items() if v is not None}
    avg_sim = sum(res.values()) / len(res.values())
    return avg_sim
Example #27
 def calculatePathDistance(self, pathA, pathB):
     courseNames = abstract.extractAllCourseNames([pathA, pathB])
     idDict = dict()
     for i, n in enumerate(courseNames):
         idDict[n] = i
     semesterA, semesterB = [], []
     for sem in pathA.semester:
         tempArr = []
         for c in sem:
             tempArr.append(chr(idDict[c.name]))
         tempArr.sort()
         semesterA.append(''.join(tempArr))
     for sem in pathB.semester:
         tempArr = []
         for c in sem:
             tempArr.append(chr(idDict[c.name]))
         tempArr.sort()
         semesterB.append(''.join(tempArr))
     strSemesterA = ''.join(semesterA)
     strSemesterB = ''.join(semesterB)
     distance = normalized_damerau_levenshtein_distance(
         strSemesterA, strSemesterB)
     return distance
Example #28
    def getCandidateBaseFeature(self, candidate, num_candidates, max_prior):
        # base feature_num
        features = []
        m_label = candidate.getMentionText()
        # number of candidates
        features.append(num_candidates)
        # max_prior
        features.append(max_prior)

        # string similarity features
        if self._has_str_sim:
            c_label = candidate.label
            if self._lowercase: c_label = c_label.lower()
            # edit_distance
            features.append(
                normalized_damerau_levenshtein_distance(c_label, m_label))
            # is equal
            features.append(1 if c_label == m_label else 0)
            # mlabel contains clabel
            features.append(1 if c_label in m_label else 0)
            # clabel contains mlabel
            features.append(1 if m_label in c_label else 0)
            # mlabel starts with clabel
            features.append(1 if m_label.startswith(c_label) else 0)
            # clabel starts with mlabel
            features.append(1 if c_label.startswith(m_label) else 0)
            # mlabel ends with clabel
            features.append(1 if m_label.endswith(c_label) else 0)
            # clabel ends with mlabel
            features.append(1 if c_label.endswith(m_label) else 0)

        # prior
        if self._has_prior:
            # entity prior
            features.append(candidate.getEntityMentionPrior())

        return features
Example #29
            for j in range(FLAGS.batch_size):
                try:
                    eocIndex = h[j].tolist().index(endOfCaseId) + 1
                except ValueError, e:
                    eocIndex = None
                suffixes_predicted[j] = h[j].tolist()[:eocIndex]
            suffixes_predicted_alpha = [
                ''.join([num2alpha[element] for element in suffix])
                for suffix in suffixes_predicted
            ]
            suffixes_alpha = [
                ''.join([num2alpha[element] for element in suffix])
                for suffix in suffixes[0]
            ]
            for j in range(FLAGS.batch_size):
                outputFile.write("{0}\n".format(suffixes_predicted_alpha[j]))
                outputFile.write("{0}\n".format(suffixes_alpha[j]))
                distance = dl.normalized_damerau_levenshtein_distance(
                    suffixes_predicted_alpha[j], suffixes_alpha[j])
                outputFile.write("{0}\n".format(distance))
                sum_distance += distance
            outputFile.flush()
            os.fsync(outputFile.fileno())
            print("Batch {} of {} ".format(batchNum, numBatches))

        outputFile.write("average edit_distance: {0}\n".format(
            sum_distance / (FLAGS.batch_size * numBatches)))
        outputFile.close()

    resultFile.close()
Example #30
import requests
import json
import pyxdameraulevenshtein

matches = json.loads(requests.get('http://worldcup.kimonolabs.com/api/matches?sort=startTime&fields=homeScore,awayScore,startTime,awayTeamId,homeTeamId,id&apikey=72519cb45986ce5ffd15020a5e4b1a70').content)
print matches
gabriel_teams = list(set('Brazil,Mexico,Spain,Chile,Colombia,Cote Divoire,Uruguay,England,Switzerland,France,Argentina,Iran,Germany,Ghana,Belgium,Russia,Brazil,Cameroon,Spain,Australia,Colombia,Japan,Uruguay,Italy,Switzerland,Honduras,Argentina,Nigeria,Germany,USA,Belgium,Korea Republic,Cameroon,Croatia,Australia,Netherlands,Japan,Greece,Italy,Costa Rica,Honduras,Ecuador,Nigeria,Bosnia-Herzegovina,USA,Portugal,Korea Republic,Algeria'.split(',')))
teams = json.loads(requests.get('http://worldcup.kimonolabs.com/api/teams?apikey=72519cb45986ce5ffd15020a5e4b1a70').content)
names = list(set([team['name'].encode('utf-8') for team in teams]))
print names
matches = [teams[min([(i,pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(gteam, team)) for i, team in enumerate(names)], key = lambda x: x[1])[0]]['id'] for gteam in gabriel_teams]
for a,b in zip(gabriel_teams, matches):
	try:
		print a,unicode(b)
	except:
		pass
Example #31
 def extract(self, source, paraphrase, position):
     # normalized Damerau-Levenshtein distance over token sequences
     return normalized_damerau_levenshtein_distance(tokenize(source),
                                                    tokenize(paraphrase))
Example #32
 def test_normalized_damerau_levenshtein_distance(self):
     assert normalized_damerau_levenshtein_distance('smtih', 'smith') == 0.20000000298023224
     assert normalized_damerau_levenshtein_distance('', '') == 0
     assert normalized_damerau_levenshtein_distance('snapple', 'apple') == 0.2857142984867096
     assert normalized_damerau_levenshtein_distance('testing', 'testtn') == 0.2857142984867096
     assert normalized_damerau_levenshtein_distance('saturday', 'sunday') == 0.375
     assert normalized_damerau_levenshtein_distance('Saturday', 'saturday') == 0.125
     assert normalized_damerau_levenshtein_distance('orange', 'pumpkin') == 1.0
     assert normalized_damerau_levenshtein_distance('gifts', 'profit') == 0.8333333134651184
     assert normalized_damerau_levenshtein_distance('Sjöstedt', 'Sjostedt') == 0.125
     assert np.isclose(normalized_damerau_levenshtein_distance([1, 2, 3], [1, 3, 2]), 1.0 / 3.0)
     assert normalized_damerau_levenshtein_distance([], []) == 0.0
     assert np.isclose(normalized_damerau_levenshtein_distance(list(range(10)), list(range(1, 11))), 0.2)
     assert normalized_damerau_levenshtein_distance([1, 2, 3, 4, 5, 6], [7, 8, 9, 7, 10, 11, 4]) == 1.0
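The last four assertions above rely on pyxdameraulevenshtein accepting arbitrary sequences of hashable items, not only strings, which makes token-level comparison straightforward:

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

a = 'the quick brown fox'.split()
b = 'the brown quick fox'.split()
print(normalized_damerau_levenshtein_distance(a, b))  # 0.25: one transposed token out of four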
Example #33
def text_edit_ratio(doc, method=u'text_edit_ratio', ground_truth=None,
                    xml_in=True, gt_format='tei', clean_in=True, clean_gt=True,
                    divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the Damerau-Levenshtein distance. The result is a value between 0.0
    (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of ground
                                truths to choose from. When more than one is
                                given, the file sharing the longest prefix with
                                the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format + ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    edist = 1.0 - normalized_damerau_levenshtein_distance(text, gt)
    logger.debug('Edit distance: {}'.format(damerau_levenshtein_distance(text, gt)))
    logger.debug('Accuracy: {}'.format(edist))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(edist))
        return output_path
    else:
        return {'edit_ratio': edist, 'ground_truth': ground_truth, 'doc': doc}
Example #34
def create_json_text_similarity(type='train'):
    if type == 'train':
        input_path = "../modified_data/ItemPairs_train_with_additional_pairs_fixed.csv"
        # input_path = "../input/ItemPairs_train.csv"
        # out_path = "../modified_data/json_text_sim_params_train.csv"
        out_path = "../orig_features/train_json_sim_param.csv"
    else:
        input_path = "../input/ItemPairs_test.csv"
        # out_path = "../modified_data/json_text_sim_params_test.csv"
        out_path = "../orig_features/test_json_sim_param.csv"

    print('Get table...')
    table = get_filled_table(input_path, type)
    only_cats = ['Опыт работы', 'Образование', 'Адрес', 'Забронированные даты', 'Модель',
                 'Знание языков', 'Отчёт Автокод', 'Кадастровый номер', 'Номер свидетельства ТС',
                 'VIN-номер', 'Корпус', 'Ссылка на документацию', 'Корпус / очередь', 'Страна', 'Название новостройки',
                 'Кадастровый номер участка', 'Адрес компании']

    vectorizer = prepareVectorizer()
    print('Write table in CSV ...')
    out = open(out_path, "w", encoding='UTF-8')
    out.write('itemID_1,itemID_2')
    # print header
    for key in only_cats:
        nm = get_param_name(key)
        out.write(',' + nm + '_dam_lev_norm')
    out.write(',address_tdidf')
    out.write('\n')

    for i, row in table.iterrows():
        out.write(str(row['itemID_1']))
        out.write(',')
        out.write(str(row['itemID_2']))
        if row['attrsJSON_1'] == -1:
            data1 = dict()
        else:
            data1 = json.loads(str(row['attrsJSON_1']))
        if row['attrsJSON_2'] == -1:
            data2 = dict()
        else:
            data2 = json.loads(str(row['attrsJSON_2']))

        for key in only_cats:
            if key not in data1 and key not in data2:
                out.write(',-1')
            else:
                str1 = ''
                str2 = ''
                if key in data1:
                    str1 = data1[key]
                if key in data2:
                    str2 = data2[key]
                val = normalized_damerau_levenshtein_distance(str1, str2)
                out.write(',' + str(val))

        # For 'Адрес' (address), use tf-idf cosine similarity instead
        for key in ['Адрес']:
            if key not in data1 and key not in data2:
                out.write(',-1')
            else:
                str1 = ''
                str2 = ''
                if key in data1:
                    str1 = data1[key]
                if key in data2:
                    str2 = data2[key]
                val = cosine_sim(str1, str2, vectorizer)
                out.write(',' + str(val))

        out.write('\n')
    out.close()
Example #35
def damerau_levenshtein(str1, str2):
    aux = pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
        str1, str2)
    return 1.0 - aux
Example #36
	last_id = 0
	for match in pattern.findall(input):
		id = ids.get(match)
		if id is None:
			ids[match] = last_id
			last_id += 1
			id = last_id - 1
		result.append(id)	
	return result		

def tokenize_file(fname):
	with open(fname) as f:
		return tokenize(f.read())

def similarity(one_tokens, other_tokens):
	distance = normalized_damerau_levenshtein_distance(one_tokens, other_tokens)
	return distance


if __name__ == "__main__":
	if len(sys.argv) != 3:
		print("Usage: %s [FILE1] [FILE2]" % sys.argv[0])
		exit(1)
		
	other = sys.argv[2]
	one = sys.argv[1]
	other_tokens = tokenize_file(other)
	one_tokens = tokenize_file(one)
	result = similarity(one_tokens, other_tokens)
	print(result)
Example #37
def descr_damerau_levenshtein_norm(row):
    return normalized_damerau_levenshtein_distance(row['description_1'], row['description_2'])
Example #38
def title_damerau_levenshtein_norm(row):
    return normalized_damerau_levenshtein_distance(row['title_1'], row['title_2'])
Example #39
    # Read the content of the text file, converting to unicode
    f = codecs.open(args.input, encoding='utf-8', mode='r')
    data = f.read().replace('\n', ' ').lower()
    f.close()

    # Eliminate special characters
    pattern = re.compile(r'[\W_]+')
    data_lower = pattern.sub(' ', data)

    firstname = ''
    n = 0
    for secondname in data_lower.split():
        if ((len(firstname) > 4) and (len(secondname) > 4)):  # two long words

            for idx, entrada in df_dict.iterrows():
                sim1 = 1.0 - normalized_damerau_levenshtein_distance(
                    firstname, entrada['first'])
                if (sim1 > threshold):
                    sim2 = 1.0 - normalized_damerau_levenshtein_distance(
                        secondname, entrada['second'])
                    if (sim2 > threshold):
                        print(args.input, firstname, secondname,
                              entrada['first'], entrada['second'],
                              str((sim1 + sim2) / 2))

        firstname = secondname

    end = datetime.datetime.now()
    diff = end - start
    print(args.input, str(diff.total_seconds()))
Example #40
def stringDistance(string1, string2):
	return pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(string1, string2)
Example #41
def get_damerau_levenshtein_distance(str1, str2, normalized=False):
    if normalized:
        dis = normalized_damerau_levenshtein_distance(str1, str2)
    else:
        dis = damerau_levenshtein_distance(str1, str2)
    return dis
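Hypothetical usage of the Example #41 wrapper (assuming both pyxdameraulevenshtein functions are imported as in Example #42 below):

print(get_damerau_levenshtein_distance('smtih', 'smith'))                   # 1 (raw edit count)
print(get_damerau_levenshtein_distance('smtih', 'smith', normalized=True))  # 0.2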
Example #42
from pyxdameraulevenshtein import damerau_levenshtein_distance, normalized_damerau_levenshtein_distance
import random
import string
import timeit

print('#edit distances (low edit distance means words are similar):')
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('smtih', 'smith', damerau_levenshtein_distance('smtih', 'smith')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('snapple', 'apple', damerau_levenshtein_distance('snapple', 'apple')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('testing', 'testtn', damerau_levenshtein_distance('testing', 'testtn')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('saturday', 'sunday', damerau_levenshtein_distance('saturday', 'sunday')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('Saturday', 'saturday', damerau_levenshtein_distance('Saturday', 'saturday')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('orange', 'pumpkin', damerau_levenshtein_distance('orange', 'pumpkin')))
print("damerau_levenshtein_distance('%s', '%s') = %d #unicode example\n" % ('Sjöstedt', 'Sjostedt', damerau_levenshtein_distance('Sjöstedt', 'Sjostedt'))) #unicode example

print('#normalized edit distances (low ratio means words are similar):')
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('smtih', 'smith', normalized_damerau_levenshtein_distance('smtih', 'smith')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('snapple', 'apple', normalized_damerau_levenshtein_distance('snapple', 'apple')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('testing', 'testtn', normalized_damerau_levenshtein_distance('testing', 'testtn')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('saturday', 'sunday', normalized_damerau_levenshtein_distance('saturday', 'sunday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('Saturday', 'saturday', normalized_damerau_levenshtein_distance('Saturday', 'saturday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('orange', 'pumpkin', normalized_damerau_levenshtein_distance('orange', 'pumpkin')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f #unicode example\n" % ('Sjöstedt', 'Sjostedt', normalized_damerau_levenshtein_distance('Sjöstedt', 'Sjostedt'))) #unicode example

print('#performance testing:')

#random words will be comprised of ascii letters, numbers, and spaces
chars = string.ascii_letters + string.digits + ' '
word1 = ''.join([random.choice(chars) for i in range(30)]) #generate a random string of characters of length 30
word2 = ''.join([random.choice(chars) for i in range(30)]) #and another
print("""timeit.timeit("damerau_levenshtein_distance('%s', '%s')", 'from pyxdameraulevenshtein import damerau_levenshtein_distance', number=500000) = %f seconds""" % \
	(word1, word2, timeit.timeit("damerau_levenshtein_distance('%s', '%s')" % (word1, word2), 'from pyxdameraulevenshtein import damerau_levenshtein_distance', number=500000)))