Example #1
def calNameScore(profile1, profile2):
	name1 = profile1["name"]
	name2 = profile2["name"]
	lang1 = profile1["nameLang"]
	lang2 = profile2["nameLang"]
	if lang1 == lang2:
		if lang1 in langs_not_western:
			return calNotWesternName(name1, name2)
		else:
			# return name_tools.match(name1, name2)
			return calNotWesternName(name1, name2)
	else:
		if len(name1)>3 and len(name2)>3:
			# Sometimes the translation is wrong, so name_tools is not used; e.g. chen shih ying (陳世穎) would be rendered as chen shiying
			try:
				if lang1 != "en":
					name1 = str(TextBlob(name1).translate(to="en"))
			except:
				pass
			try:
				if lang2 != "en":
					name2 = str(TextBlob(name2).translate(to="en"))
			except:
				pass
			return 1-pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
		else:
			return 1-pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
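Example #1 (and its reformatted duplicate, Example #2 below) converts the normalized distance (0.0 for identical strings, 1.0 for completely different ones) into a similarity by subtracting it from 1. A minimal self-contained sketch of that pattern (the name_similarity helper is illustrative, not part of the example above):

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

def name_similarity(name1, name2):
    # 1.0 means identical, 0.0 means no commonality
    return 1 - normalized_damerau_levenshtein_distance(name1, name2)

print(name_similarity('chen shih ying', 'chen shiying'))  # ~0.86, two edits over 14 characters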
Example #2
def calNameScore(profile1, profile2):
    name1 = profile1["name"]
    name2 = profile2["name"]
    lang1 = profile1["nameLang"]
    lang2 = profile2["nameLang"]
    if lang1 == lang2:
        if lang1 in langs_not_western:
            return calNotWesternName(name1, name2)
        else:
            # return name_tools.match(name1, name2)
            return calNotWesternName(name1, name2)
    else:
        if len(name1) > 3 and len(name2) > 3:
            # Sometimes the translation is wrong, so name_tools is not used; e.g. chen shih ying (陳世穎) would be rendered as chen shiying
            try:
                if lang1 != "en":
                    name1 = str(TextBlob(name1).translate(to="en"))
            except:
                pass
            try:
                if lang2 != "en":
                    name2 = str(TextBlob(name2).translate(to="en"))
            except:
                pass
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
                name1, name2)
        else:
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
                name1, name2)
Example #3
def flag_na_n_ad(InputKamus, InputFTS):

    InputKamus = str(InputKamus)
    InputFTS = str(InputFTS)
    digits_input = ' '.join(re.findall(r'\d+', InputKamus))
    digits_fts = ' '.join(re.findall(r'\d+', InputFTS))
    no_int_kamus = ' '.join([
        x for x in InputKamus.split()
        if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())
    ])
    no_int_fts = ' '.join([
        x for x in InputFTS.split()
        if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())
    ])
    # try:
    if digits_input or digits_fts:
        nilai_digits = normalized_damerau_levenshtein_distance(
            digits_fts, digits_input)
        # print "number", nilai_digits
        # print 'DIGITS', nilai_digits
        if nilai_digits >= 0.5:
            return 'CONFIDENT'
        elif nilai_digits <= 0.4:
            return 'CONFIDENT'

    if no_int_kamus or no_int_fts:
        nilai_kalimat = normalized_damerau_levenshtein_distance(
            no_int_kamus, no_int_fts)
        # print "word", nilai_kalimat
        # print 'KATA', nilai_kalimat
        if nilai_kalimat >= 0.5:
            return 'CONFIDENT'
        elif nilai_kalimat <= 0.4:
            return 'CONFIDENT'
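Example #3 scores the numeric and non-numeric parts of two strings separately. A short sketch of its token-splitting step, on a made-up address string:

import re

s = 'Jl. Sudirman No. 12 Blok B-3'
digits = ' '.join(re.findall(r'\d+', s))  # '12 3'
words = ' '.join(x for x in s.split()
                 if not (x.isdigit() or (x[0] == '-' and x[1:].isdigit())))
print(digits)  # the digit runs only
print(words)   # every token that is not a pure (possibly negative) integer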
Example #4
def get_similarities(Features, url_input):
    """
    similarity metrics include: Levenshtein, jaro, damerau levenshtein, normalized_damerau_levenshtein,
    and hamming distance
    :param Features: input dictionary to add things to
    :param url_input: the URL string to compare against the domain and brand names
    :return: Features: after adding all similarity metrics
    """
    for n in itertools.chain(product_domain_names, brand_names):
        Features['url_levenshtein_distance_' + n] = Levenshtein.distance(
            url_input, n)
        Features['fqdn_levenshtein_distance_' + n] = Levenshtein.distance(
            Features['fqdn'], n)
        Features['url_jaro_winkler_distance_' + n] = jw.get_jaro_distance(
            url_input, n)
        Features['fqdn_jaro_winkler_distance_' + n] = jw.get_jaro_distance(
            Features['fqdn'], n)
        Features['url_damerau_levenshtein_distance_' +
                 n] = dl.damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_distance_' +
                 n] = dl.damerau_levenshtein_distance(Features['fqdn'], n)
        Features['url_damerau_levenshtein_normalized_distance_' +
                 n] = dl.normalized_damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_normalized_distance_' +
                 n] = dl.normalized_damerau_levenshtein_distance(
                     Features['fqdn'], n)
        if len(n) == len(url_input):
            Features['url_length_equals_' + n] = 1
            Features['url_hamming_distance_' + n] = hamming(url_input, n)
            Features['fqdn_hamming_distance_' + n] = hamming(
                Features['fqdn'], n)
        else:
            Features['url_length_equals_' + n] = 0
    return Features
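Example #4 stores both the raw and the normalized Damerau-Levenshtein distances as features. A hypothetical typosquatting pair shows the difference:

from pyxdameraulevenshtein import (damerau_levenshtein_distance,
                                   normalized_damerau_levenshtein_distance)

print(damerau_levenshtein_distance('paypa1.com', 'paypal.com'))             # 1 (absolute edit count)
print(normalized_damerau_levenshtein_distance('paypa1.com', 'paypal.com'))  # 0.1 (scaled by the longer string)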
Example #5
def get_str_similarity(source, target, label):
    try:
        source = source.lower()
        target = target.lower()
        long_sub = combine_google_fsq_for_gt.longest_substring(source, target)
        similarities = {
            label + "rosimilarity":
            combine_google_fsq_for_gt.get_ro_similarity(source, target),
            label + "dlevensimilarity":
            1 - normalized_damerau_levenshtein_distance(source, target),
            label + "levensimilarity":
            Levenshtein.ratio(source, target),
            label + "phoneticsimilarity":
            combine_google_fsq_for_gt.get_levenshtein_phonetic_similarity(
                source, target),
        }
        if (len(long_sub)):
            similarities[label +
                         "lenlongsubstring"] = len(long_sub) / len(source)
        else:
            similarities[label + "lenlongsubstring"] = 0
    except:
        similarities = {
            label + "rosimilarity": None,
            label + "dlevensimilarity": None,
            label + "levensimilarity": None,
            label + "phoneticsimilarity": None,
            label + "lenlongsubstring": None,
        }
    res = {k: v for k, v in similarities.items()}
    return res
Example #6
 def rank(self, src, tgt):
     ''' Returns the rank of the source and target paths. '''
     p = len(set(tgt) - set(src))
     a = normalized_damerau_levenshtein_distance(str(src), str(tgt)) + p
     b = max(len(src), len(tgt)) + p
     candidateScore = 1 - (a / b)
     return candidateScore
Example #7
 def rank(self, src, tgt):
     ''' Returns the rank of the source and target paths. '''
     p = len(set(tgt) - set(src))
     a = normalized_damerau_levenshtein_distance(unicode(src), unicode(tgt)) + p
     b = max(len(src), len(tgt)) + p
     candidateScore = 1 - (a/b)
     return candidateScore
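Example #7 is the Python 2 variant of Example #6 (unicode() instead of str()). Both add a penalty p for elements of the target that never occur in the source before scaling. A standalone sketch for trying the formula outside the class:

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

def rank(src, tgt):
    p = len(set(tgt) - set(src))  # elements of tgt missing from src
    a = normalized_damerau_levenshtein_distance(str(src), str(tgt)) + p
    b = max(len(src), len(tgt)) + p
    return 1 - (a / b)

print(rank('/usr/bin', '/usr/local/bin'))  # closer to 1.0 the more similar the paths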
Example #8
def norm_dld(l1, l2):
    ascii_start = 0
    # make a string for l1
    # all triples are unique...
    s1 = ''.join((chr(ascii_start+i) for i in range(len(l1))))
    s1_upd = list(s1)
    for i in range(len(l1)):
        for j in range(i+1, len(l1)):
            if trip_match(l1[i], l1[j]):
                s1_upd[j] = s1[i]
    s1_upd = ''.join(s1_upd)
    s2 = ''
    next_char = ascii_start + len(s1)
    for j in range(len(l2)):
        found = None
        #next_char = chr(ascii_start+len(s1)+j)
        for k in range(len(l1)):
            if trip_match(l2[j], l1[k]):
                found = s1_upd[k]
                #next_char = s1[k]
                break
        if found is None:
            s2 += chr(next_char)
            next_char += 1
            #assert next_char <= 128
        else:
            s2 += found
    # return 1- , since this thing gives 0 to perfect matches etc
    return 1.0-normalized_damerau_levenshtein_distance(s1_upd, s2)
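Example #8 maps each unique triple to a single character so that the string-based distance applies to arbitrary sequences. A simplified sketch of the same encoding trick, with trip_match replaced by plain equality (newer releases of pyxdameraulevenshtein also accept lists directly, as Example #32 shows):

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

def seq_similarity(seq1, seq2):
    codes = {}  # shared mapping: item -> one-character code

    def encode(seq):
        return ''.join(chr(codes.setdefault(item, len(codes))) for item in seq)

    s1, s2 = encode(seq1), encode(seq2)
    return 1.0 - normalized_damerau_levenshtein_distance(s1, s2)

print(seq_similarity([('a', 1), ('b', 2)], [('b', 2), ('a', 1)]))  # 0.5, one transposition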
Example #9
def compare_DL(filename1, filename2):
    # Sanity check
    if not os.path.isfile(filename1):
        print('\nERROR: First source file ' + filename1 + ' was not found.\n')
        return (-3)
    if not os.path.isfile(filename2):
        print('\nERROR: Second source file ' + filename2 + ' was not found.\n')
        return (-4)

    # Read the content of the first file
    text1 = ""
    f1 = None
    with open(filename1) as f1:
        lines1 = [line.rstrip('\n') for line in f1]
    for line in lines1:
        text1 += line + ' '
    text1 = text1[:-1]

    # Read the content of the second file
    text2 = ""
    f2 = None
    with open(filename2) as f2:
        lines2 = [line.rstrip('\n') for line in f2]
    for line in lines2:
        text2 += line + ' '
    text2 = text2[:-1]

    sim = 1.0 - normalized_damerau_levenshtein_distance(text1, text2)
    return (sim)
Example #10
def winner(v_):
    n = v_.size
    sim = [0.0] * n

    if n == 1: return 0, 1.0

    i = 0
    while (i < n):
        j = i + 1
        while (j < n):
            s = 1.0 - normalized_damerau_levenshtein_distance(v_[i], v_[j])
            if s == 1.0:  # Two identical values
                return i, 1.0
            sim[i] = sim[i] + s
            sim[j] = sim[j] + s
            j = j + 1
        i = i + 1

    # Search maximum and save the index to return it
    sim_max = sim[0]
    i_max = 0
    i = 1
    while (i < n):
        if (sim_max < sim[i]):
            i_max = i
            sim_max = sim[i]
        i = i + 1

    return i_max, (sim_max / (n - 1))
Example #11
def winner(v_):
    n = len(v_)
    sim = [0.0] * n

    if n == 1: return -1, 1.0

    i = 0
    while (i < n):
        j = i + 1
        while (j < n):
            if v_[i] == v_[j]:
                return (i, 1.0)
            s = 1.0 - normalized_damerau_levenshtein_distance(v_[i], v_[j])
            sim[i] = sim[i] + s
            sim[j] = sim[j] + s
            j = j + 1
        i = i + 1

    # Search maximum and return the index
    sim_max = sim[0]
    i_max = 0
    i = 1
    while (i < n):
        if (sim_max < sim[i]):
            i_max = i
            sim_max = sim[i]
        i = i + 1

    return i_max, (sim_max / (n - 1))
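winner() sums each value's similarity to every other value and returns the index of the most central one, short-circuiting as soon as two identical values are found. Hypothetical usage of the Example #11 variant:

variants = ['color', 'colour', 'colr']
idx, confidence = winner(variants)
print(variants[idx], confidence)  # the spelling most similar to the rest, on average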
Example #12
 def calc(self, search_word: str):
     word1_string = mojimoji.han_to_zen(search_word.lower())
     word2_string = mojimoji.han_to_zen(self.keyword.lower())
     self.distance = Decimal("1.0") - Decimal(
         str(
             normalized_damerau_levenshtein_distance(
                 word1_string, word2_string)))
     return self
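Example #12 normalizes half-width characters to full-width with mojimoji before measuring the distance, so visually equivalent Japanese strings compare as identical. A minimal sketch, assuming the mojimoji package is installed:

import mojimoji
from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

a = mojimoji.han_to_zen('ﾎﾟｯｷｰ')  # half-width katakana -> 'ポッキー'
print(normalized_damerau_levenshtein_distance(a, 'ポッキー'))  # 0.0 after normalization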
Example #13
def calDisplayNameScore(profile1, profile2):
	name1 = profile1["displayName"]
	name2 = profile2["displayName"]
	lang1 = profile1["displayNameLang"]
	lang2 = profile2["displayNameLang"]
	if lang1 == lang2:
		return 1-pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
	else:
		if len(name1)>3 and len(name2)>3:
			try:
				if lang1 != "en":
					name1 = str(TextBlob(name1).translate(to="en"))
				if lang2 != "en":
					name2 = str(TextBlob(name2).translate(to="en"))
			except:
				pass
			return 1-pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
		else:
			return 1-pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
Example #14
def is_similar(source: str, target: str, threshold: float) -> bool:
    """
    使用現成模組,來源
    https://github.com/gfairchild/pyxDamerauLevenshtein
    Args:
        source: 使用者的查詢字串
        target: 資料庫的字串
        threshold: 一個介於0到1之間的值,表示可以容許兩個字串相異程度的最大值
    """
    distance = normalized_damerau_levenshtein_distance(source, target)
    return distance < threshold
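Hypothetical usage of is_similar; threshold is the largest normalized distance still accepted as a match:

print(is_similar('restarant', 'restaurant', threshold=0.3))  # True, distance is 0.1
print(is_similar('cafe', 'restaurant', threshold=0.3))       # False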
Example #15
def suffix_measure(x, y, x2, res):
    for k in range(len(y)):
        b = y[k]
        h = b != 1
        b = b[h].clone()
        a = x2[:, x.shape[1]:][k]
        a = a[h].clone()
        a_res = a.cpu().numpy()
        b_res = b.cpu().numpy()

        res.append(
            1 -
            normalized_damerau_levenshtein_distance(list(a_res), list(b_res)))
Example #16
def calDisplayNameScore(profile1, profile2):
    name1 = profile1["displayName"]
    name2 = profile2["displayName"]
    lang1 = profile1["displayNameLang"]
    lang2 = profile2["displayNameLang"]
    if lang1 == lang2:
        return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
            name1, name2)
    else:
        if len(name1) > 3 and len(name2) > 3:
            try:
                if lang1 != "en":
                    name1 = str(TextBlob(name1).translate(to="en"))
                if lang2 != "en":
                    name2 = str(TextBlob(name2).translate(to="en"))
            except:
                pass
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
                name1, name2)
        else:
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
                name1, name2)
Example #17
 def test_normalized_damerau_levenshtein_distance(self):
     assert normalized_damerau_levenshtein_distance(
         'smtih', 'smith') == 0.20000000298023224
     assert normalized_damerau_levenshtein_distance(
         'snapple', 'apple') == 0.2857142984867096
     assert normalized_damerau_levenshtein_distance(
         'testing', 'testtn') == 0.2857142984867096
     assert normalized_damerau_levenshtein_distance('saturday',
                                                    'sunday') == 0.375
     assert normalized_damerau_levenshtein_distance('Saturday',
                                                    'saturday') == 0.125
     assert normalized_damerau_levenshtein_distance('orange',
                                                    'pumpkin') == 1.0
     assert normalized_damerau_levenshtein_distance(
         'gifts', 'profit') == 0.8333333134651184
     assert normalized_damerau_levenshtein_distance('Sjöstedt',
                                                    'Sjostedt') == 0.125
Example #18
    def calculatePathDistance(self, pathA, pathB):
        semA, semB = self.createSemesterWords(pathA, pathB)
        for i, s1 in enumerate(semA):
            distRow = np.array([])
            for j, path2 in enumerate(self.paths):
                if j <= i:
                    distRow = np.append(distRow, [0])
                else:
                    distRow = np.append(distRow, [self.calculatePathDistance(pathA, path2)])
            if self.distanceMatrix is None:
                self.distanceMatrix = np.array([distRow])
            else:
                self.distanceMatrix = np.vstack((self.distanceMatrix, distRow))

        strSemesterA, strSemesterB = ''.join(semA), ''.join(semB)
        distance = normalized_damerau_levenshtein_distance(strSemesterA, strSemesterB)
        return distance
Example #19
def norm_dld(l1, l2):
    ascii_start = 0
    assert len(l1) + len(l2) <= 128
    # make a string for l1
    # all triples are unique...
    s1 = ''.join((chr(ascii_start+i) for i in xrange(len(l1))))
    s2 = ''
    for j in xrange(len(l2)):
        next_char = chr(ascii_start+len(s1)+j)
        for k in xrange(len(l1)):
            if trip_match(l2[j], l1[k]):
                next_char = s1[k]
                break
        s2 += next_char
    # return 1- , since this thing gives 0 to perfect matches etc
    return 1.0-normalized_damerau_levenshtein_distance(s1, s2)
Example #20
def string_similarity_ratio(s1, s2):
    """
    A string compare function, using the Redcliff-Obershelp algorithm. For
    further details see: http://docs.python.org/3.3/library/difflib.html
    TODO: Levenshtein might be better for this purpose.

    :params s1, s2: Two input strings which will be compared
    :returns: A ratio between 0.0 (not similar at all) and 1.0 (probably the
    same string).

    """
    if s1 and s2:
        return 1 - normalized_damerau_levenshtein_distance(
            _clean_movie_title(s1),
            _clean_movie_title(s2)
        )
Example #21
def generate_reward(gold_summary, summary, gold_cp, cp, reward_type=1):
    #Bleu score
    # bleu = corpus_bleu([gold_summary],summary)

    cp = list(deepcopy(cp))
    # DLD
    if gold_cp:
        dld = normalized_damerau_levenshtein_distance(list(gold_cp), list(cp))
    else:
        dld = 0.
    boolean = np.zeros(len(cp))
    for pos, element in enumerate(cp):
        if element in gold_cp:
            boolean[pos] = 1
    precision = np.mean(boolean)
    recall = np.sum(boolean) / len(gold_cp)
    return (precision + recall + (1 - dld)) / 3
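The reward averages precision, recall, and one minus the normalized edit distance between the two sequences. A toy run with made-up content-plan tokens (the two summary arguments are unused in the body above):

gold_cp = ['PLAYER', 'TEAM', 'SCORE']
cp = ['TEAM', 'PLAYER', 'SCORE']
print(generate_reward(None, None, gold_cp, cp))  # (1 + 1 + (1 - 1/3)) / 3 ~= 0.889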
Example #22
    def compute_distances(reference, values, exp=None):
        if exp is None:
            exp = Config.SIMILARITY_EXPONENT

        sm = difflib.SequenceMatcher()
        sm.set_seq2(reference)
        sm_distances = []
        dl_distances = []
        for val in values:
            sm.set_seq1(val)
            sm_distances.append(sm.ratio())
            dl_distances.append(
                1 - normalized_damerau_levenshtein_distance(reference, val))
        sm_distances = np.array(sm_distances)
        dl_distances = np.array(dl_distances)
        dl_exp = np.power(dl_distances, exp)
        sm_exp = np.power(sm_distances, exp)
        dist_sum = dl_exp + sm_exp
        return np.power(dist_sum, 1 / exp)
Example #23
    def calc_distance_domains(self):
        not_in_top = []
        output = []
        for item in self.data:
            if item not in self.top_domains:
                not_in_top.append(item)

        for item in not_in_top:
            entry = {item: {}}
            flag = False
            for td in self.top_domains:
                dist = normalized_damerau_levenshtein_distance(item, td)
                if 0 < dist < 0.2:
                    entry[item][td] = dist
                    flag = True
            if flag is True:
                output.append(entry)
        if len(output) > 0:
            return output
        else:
            return None
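Example #23 flags a domain when its normalized distance to a top domain lands in the open interval (0, 0.2): close enough to look like a typo, but not an exact match. The window on hypothetical values:

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

print(normalized_damerau_levenshtein_distance('goggle.com', 'google.com'))  # 0.1, would be flagged
print(normalized_damerau_levenshtein_distance('google.com', 'google.com'))  # 0.0, identical, ignored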
Example #24
def processtxn(txn, choices):
    maxscoreJ = 0
    matchstrJ = ""
    maxscoreDL = 0
    matchstrDL = ""
    maxscoreNDL = 0
    matchstrNDL = ""
    for c in choices:
        scoreJ = jaro.jaro_metric(txn, c)
        scoreDL = 1000 - damerau_levenshtein_distance(txn, c)
        scoreNDL = 1 - normalized_damerau_levenshtein_distance(txn, c)
        if scoreJ > maxscoreJ:
            matchstrJ = c
            maxscoreJ = scoreJ
        if scoreDL > maxscoreDL:
            matchstrDL = c
            maxscoreDL = scoreDL
        if scoreNDL > maxscoreNDL:
            matchstrNDL = c
            maxscoreNDL = scoreNDL
    return {'jaro': matchstrJ, 'dl': matchstrDL, 'ndl': matchstrNDL}
Example #25
def get_likely_cfg_array_version(glycan_list, distance_threshold=2.0):
    '''Get the most likely CFG glycan array given a list of glycans.

    Uses a scaled Levenshtein distance to compute similarity between glycan strings,
    and returns the array with the minimum sum of scaled levenshtein distances
    for each pair of glycans in the glycan list and corresponding reference array.

    We need to do this because sometimes the array version is not provided, and 
    there are slight spelling errors in the provided glycan names. It is easier to
    match to a reference list of glycans for a particular array version, with all 
    errors corrected.

    Args:
        glycan_list (list): A list of glycan strings ordered by index.
        distance_threshold (float): A threshold for total scaled Levenshtein distance for calling a match.
    Returns:
        CFG glycan list (list), most likely array version (string), number of mismatches (int), scaled Levenshtein distance (float)
    '''
    glycan_list = list(glycan_list)
    for i, glycan in enumerate(glycan_list):
        # Handle odd characters in some excel files. Nonbreaking spaces, greek letters etc.
        glycan_list[i] = glycan.replace('–', '-').replace('α', 'a') \
                            .replace('β', 'b').replace('[', '(') \
                            .replace(']', ')').replace(' ', '').replace(u"\u00A0", '')
    likely_array = None
    likely_array_mismatches = None
    scaled_levenshtein_total = 0
    for key, value in cfg_array_versions.items():
        # Take into account glycans which are almost the same.
        array_version = [x.replace(' ', '') for x in value[1]]
        scaled_levenshtein_sum = np.sum([normalized_damerau_levenshtein_distance(x, y) for x, y in zip_longest(glycan_list, array_version, fillvalue='')])
        non_matches = len([x for x in zip(glycan_list, array_version) if x[0] != x[1]])
        #if not likely_array or non_matches < likely_array_mismatches:
        if not likely_array or scaled_levenshtein_sum < scaled_levenshtein_total:
            likely_array = key
            likely_array_mismatches = non_matches
            scaled_levenshtein_total = scaled_levenshtein_sum
    if scaled_levenshtein_total > distance_threshold:
        raise ValueError("Glycan list does not match to known array versions.")
    return list(cfg_array_versions[likely_array][1]), likely_array, likely_array_mismatches, scaled_levenshtein_total
Example #26
def get_str_similarity(source, target):
    source = source.lower()
    target = target.lower()
    long_sub = longest_substring(source, target)
    #"long_substring": longest_substring(source, target),
    similarities = {
        "ro_similarity":
        get_ro_similarity(source, target),
        "dleven_similarity":
        1 - normalized_damerau_levenshtein_distance(source, target),
        "leven_similarity":
        Levenshtein.ratio(source, target),
        "phonetic_similarity":
        get_levenshtein_phonetic_similarity(source, target),
    }
    if (len(long_sub)):
        similarities["len_long_substring"] = len(long_sub) / len(source)
    else:
        similarities["len_long_substring"] = 0
    res = {k: v for k, v in similarities.items() if v is not None}
    avg_sim = sum(res.values()) / len(res.values())
    return avg_sim
Example #27
 def calculatePathDistance(self, pathA, pathB):
     courseNames = abstract.extractAllCourseNames([pathA, pathB])
     idDict = dict()
     for i, n in enumerate(courseNames):
         idDict[n] = i
     semesterA, semesterB = [], []
     for sem in pathA.semester:
         tempArr = []
         for c in sem:
             tempArr.append(chr(idDict[c.name]))
         tempArr.sort()
         semesterA.append(''.join(tempArr))
     for sem in pathB.semester:
         tempArr = []
         for c in sem:
             tempArr.append(chr(idDict[c.name]))
         tempArr.sort()
         semesterB.append(''.join(tempArr))
     strSemesterA = ''.join(semesterA)
     strSemesterB = ''.join(semesterB)
     distance = normalized_damerau_levenshtein_distance(
         strSemesterA, strSemesterB)
     return distance
Example #28
    def getCandidateBaseFeature(self, candidate, num_candidates, max_prior):
        # base feature_num
        features = []
        m_label = candidate.getMentionText()
        # number of candidates
        features.append(num_candidates)
        # max_prior
        features.append(max_prior)

        # string similarity features
        if self._has_str_sim:
            c_label = candidate.label
            if self._lowercase: c_label = c_label.lower()
            # edit_distance
            features.append(
                normalized_damerau_levenshtein_distance(c_label, m_label))
            # is equal
            features.append(1 if c_label == m_label else 0)
            # mlabel contains clabel
            features.append(1 if c_label in m_label else 0)
            # clabel contains mlabel
            features.append(1 if m_label in c_label else 0)
            # mlabel starts with clabel
            features.append(1 if m_label.startswith(c_label) else 0)
            # clabel starts with mlabel
            features.append(1 if c_label.startswith(m_label) else 0)
            # mlabel ends with clabel
            features.append(1 if m_label.endswith(c_label) else 0)
            # clabel ends with mlabel
            features.append(1 if c_label.endswith(m_label) else 0)

        # prior
        if self._has_prior:
            # entity prior
            features.append(candidate.getEntityMentionPrior())

        return features
Example #29
            for j in range(FLAGS.batch_size):
                try:
                    eocIndex = h[j].tolist().index(endOfCaseId) + 1
                except ValueError, e:
                    eocIndex = None
                suffixes_predicted[j] = h[j].tolist()[:eocIndex]
            suffixes_predicted_alpha = [
                ''.join([num2alpha[element] for element in suffix])
                for suffix in suffixes_predicted
            ]
            suffixes_alpha = [
                ''.join([num2alpha[element] for element in suffix])
                for suffix in suffixes[0]
            ]
            for j in range(FLAGS.batch_size):
                outputFile.write("{0}\n".format(suffixes_predicted_alpha[j]))
                outputFile.write("{0}\n".format(suffixes_alpha[j]))
                distance = dl.normalized_damerau_levenshtein_distance(
                    suffixes_predicted_alpha[j], suffixes_alpha[j])
                outputFile.write("{0}\n".format(distance))
                sum_distance += distance
            outputFile.flush()
            os.fsync(outputFile.fileno())
            print("Batch {} of {} ".format(batchNum, numBatches))

        outputFile.write("average edit_distance: {0}\n".format(
            sum_distance / (FLAGS.batch_size * numBatches)))
        outputFile.close()

    resultFile.close()
Example #30
import requests
import json
import pyxdameraulevenshtein

matches = json.loads(requests.get('http://worldcup.kimonolabs.com/api/matches?sort=startTime&fields=homeScore,awayScore,startTime,awayTeamId,homeTeamId,id&apikey=72519cb45986ce5ffd15020a5e4b1a70').content)
print matches
gabriel_teams = list(set('Brazil,Mexico,Spain,Chile,Colombia,Cote Divoire,Uruguay,England,Switzerland,France,Argentina,Iran,Germany,Ghana,Belgium,Russia,Brazil,Cameroon,Spain,Australia,Colombia,Japan,Uruguay,Italy,Switzerland,Honduras,Argentina,Nigeria,Germany,USA,Belgium,Korea Republic,Cameroon,Croatia,Australia,Netherlands,Japan,Greece,Italy,Costa Rica,Honduras,Ecuador,Nigeria,Bosnia-Herzegovina,USA,Portugal,Korea Republic,Algeria'.split(',')))
teams = json.loads(requests.get('http://worldcup.kimonolabs.com/api/teams?apikey=72519cb45986ce5ffd15020a5e4b1a70').content)
names = list(set([team['name'].encode('utf-8') for team in teams]))
print names
matches = [teams[min([(i,pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(gteam, team)) for i, team in enumerate(names)], key = lambda x: x[1])[0]]['id'] for gteam in gabriel_teams]
for a,b in zip(gabriel_teams, matches):
	try:
		print a,unicode(b)
	except:
		pass
Example #31
 def extract(self, source, paraphrase, position):
     # normalized Damerau-Levenshtein distance over token sequences
     return normalized_damerau_levenshtein_distance(tokenize(source),
                                                    tokenize(paraphrase))
Example #32
 def test_normalized_damerau_levenshtein_distance(self):
     assert normalized_damerau_levenshtein_distance('smtih', 'smith') == 0.20000000298023224
     assert normalized_damerau_levenshtein_distance('', '') == 0
     assert normalized_damerau_levenshtein_distance('snapple', 'apple') == 0.2857142984867096
     assert normalized_damerau_levenshtein_distance('testing', 'testtn') == 0.2857142984867096
     assert normalized_damerau_levenshtein_distance('saturday', 'sunday') == 0.375
     assert normalized_damerau_levenshtein_distance('Saturday', 'saturday') == 0.125
     assert normalized_damerau_levenshtein_distance('orange', 'pumpkin') == 1.0
     assert normalized_damerau_levenshtein_distance('gifts', 'profit') == 0.8333333134651184
     assert normalized_damerau_levenshtein_distance('Sjöstedt', 'Sjostedt') == 0.125
     assert np.isclose(normalized_damerau_levenshtein_distance([1, 2, 3], [1, 3, 2]), 1.0 / 3.0)
     assert normalized_damerau_levenshtein_distance([], []) == 0.0
     assert np.isclose(normalized_damerau_levenshtein_distance(list(range(10)), list(range(1, 11))), 0.2)
     assert normalized_damerau_levenshtein_distance([1, 2, 3, 4, 5, 6], [7, 8, 9, 7, 10, 11, 4]) == 1.0
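The last four assertions above rely on pyxdameraulevenshtein accepting arbitrary sequences of hashable items, not only strings, which makes token-level comparison straightforward:

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

a = 'the quick brown fox'.split()
b = 'the brown quick fox'.split()
print(normalized_damerau_levenshtein_distance(a, b))  # 0.25: one transposed token out of four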
Example #33
def text_edit_ratio(doc, method=u'text_edit_ratio', ground_truth=None,
                    xml_in=True, gt_format='tei', clean_in=True, clean_gt=True,
                    divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the Damerau-Levenshtein distance. The result is a value between 0.0
    (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of ground
                                truths to choose from. When more than one is
                                given, the file sharing the longest prefix with
                                the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format + ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    edist = 1.0 - normalized_damerau_levenshtein_distance(text, gt)
    logger.debug('Edit distance: {}'.format(damerau_levenshtein_distance(text, gt)))
    logger.debug('Accuracy: {}'.format(edist))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(edist))
        return output_path
    else:
        return {'edit_ratio': edist, 'ground_truth': ground_truth, 'doc': doc}
Example #34
def create_json_text_similarity(type='train'):
    if type == 'train':
        input_path = "../modified_data/ItemPairs_train_with_additional_pairs_fixed.csv"
        # input_path = "../input/ItemPairs_train.csv"
        # out_path = "../modified_data/json_text_sim_params_train.csv"
        out_path = "../orig_features/train_json_sim_param.csv"
    else:
        input_path = "../input/ItemPairs_test.csv"
        # out_path = "../modified_data/json_text_sim_params_test.csv"
        out_path = "../orig_features/test_json_sim_param.csv"

    print('Get table...')
    table = get_filled_table(input_path, type)
    only_cats = ['Опыт работы', 'Образование', 'Адрес', 'Забронированные даты', 'Модель',
                 'Знание языков', 'Отчёт Автокод', 'Кадастровый номер', 'Номер свидетельства ТС',
                 'VIN-номер', 'Корпус', 'Ссылка на документацию', 'Корпус / очередь', 'Страна', 'Название новостройки',
                 'Кадастровый номер участка', 'Адрес компании']

    vectorizer = prepareVectorizer()
    print('Write table in CSV ...')
    out = open(out_path, "w", encoding='UTF-8')
    out.write('itemID_1,itemID_2')
    # print header
    for key in only_cats:
        nm = get_param_name(key)
        out.write(',' + nm + '_dam_lev_norm')
    out.write(',address_tdidf')
    out.write('\n')

    for i, row in table.iterrows():
        out.write(str(row['itemID_1']))
        out.write(',')
        out.write(str(row['itemID_2']))
        if row['attrsJSON_1'] == -1:
            data1 = dict()
        else:
            data1 = json.loads(str(row['attrsJSON_1']))
        if row['attrsJSON_2'] == -1:
            data2 = dict()
        else:
            data2 = json.loads(str(row['attrsJSON_2']))

        for key in only_cats:
            if key not in data1 and key not in data2:
                out.write(',-1')
            else:
                str1 = ''
                str2 = ''
                if key in data1:
                    str1 = data1[key]
                if key in data2:
                    str2 = data2[key]
                val = normalized_damerau_levenshtein_distance(str1, str2)
                out.write(',' + str(val))

        # For 'Адрес' (address), use tf-idf cosine similarity instead
        for key in ['Адрес']:
            if key not in data1 and key not in data2:
                out.write(',-1')
            else:
                str1 = ''
                str2 = ''
                if key in data1:
                    str1 = data1[key]
                if key in data2:
                    str2 = data2[key]
                val = cosine_sim(str1, str2, vectorizer)
                out.write(',' + str(val))

        out.write('\n')
    out.close()
Example #35
def damerau_levenshtein(str1, str2):
    aux = pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(
        str1, str2)
    return 1.0 - aux
Example #36
	last_id = 0
	for match in pattern.findall(input):
		id = ids.get(match)
		if id is None:
			ids[match] = last_id
			last_id += 1
			id = last_id - 1
		result.append(id)	
	return result		

def tokenize_file(fname):
	with open(fname) as f:
		return tokenize(f.read())

def similarity(one_tokens, other_tokens):
	distance = normalized_damerau_levenshtein_distance(one_tokens, other_tokens)
	return distance


if __name__ == "__main__":
	if len(sys.argv) != 3:
		print("Usage: %s [FILE1] [FILE2]" % sys.argv[0])
		exit(1)
		
	other = sys.argv[2]
	one = sys.argv[1]
	other_tokens = tokenize_file(other)
	one_tokens = tokenize_file(one)
	result = similarity(one_tokens, other_tokens)
	print(result)
Example #37
def descr_damerau_levenshtein_norm(row):
    return normalized_damerau_levenshtein_distance(row['description_1'], row['description_2'])
Example #38
def title_damerau_levenshtein_norm(row):
    return normalized_damerau_levenshtein_distance(row['title_1'], row['title_2'])
Example #39
    # Read the content of the text file, converting to unicode
    f = codecs.open(args.input, encoding='utf-8', mode='r')
    data = f.read().replace('\n', ' ').lower()
    f.close()

    # Eliminate special characters
    pattern = re.compile(r'[\W_]+')
    data_lower = pattern.sub(' ', data)

    firstname = ''
    n = 0
    for secondname in data_lower.split():
        if ((len(firstname) > 4) and (len(secondname) > 4)):  # two long words

            for idx, entrada in df_dict.iterrows():
                sim1 = 1.0 - normalized_damerau_levenshtein_distance(
                    firstname, entrada['first'])
                if (sim1 > threshold):
                    sim2 = 1.0 - normalized_damerau_levenshtein_distance(
                        secondname, entrada['second'])
                    if (sim2 > threshold):
                        print(args.input, firstname, secondname,
                              entrada['first'], entrada['second'],
                              str((sim1 + sim2) / 2))

        firstname = secondname

    end = datetime.datetime.now()
    diff = end - start
    print(args.input, str(diff.total_seconds()))
Example #40
def stringDistance(string1, string2):
	return pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(string1, string2)
Example #41
def get_damerau_levenshtein_distance(str1, str2, normalized=False):
    if normalized:
        dis = normalized_damerau_levenshtein_distance(str1, str2)
    else:
        dis = damerau_levenshtein_distance(str1, str2)
    return dis
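Hypothetical usage of the Example #41 wrapper (assuming both pyxdameraulevenshtein functions are imported as in Example #42 below):

print(get_damerau_levenshtein_distance('smtih', 'smith'))                   # 1 (raw edit count)
print(get_damerau_levenshtein_distance('smtih', 'smith', normalized=True))  # 0.2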
Example #42
from pyxdameraulevenshtein import damerau_levenshtein_distance, normalized_damerau_levenshtein_distance
import random
import string
import timeit

print('#edit distances (low edit distance means words are similar):')
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('smtih', 'smith', damerau_levenshtein_distance('smtih', 'smith')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('snapple', 'apple', damerau_levenshtein_distance('snapple', 'apple')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('testing', 'testtn', damerau_levenshtein_distance('testing', 'testtn')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('saturday', 'sunday', damerau_levenshtein_distance('saturday', 'sunday')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('Saturday', 'saturday', damerau_levenshtein_distance('Saturday', 'saturday')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('orange', 'pumpkin', damerau_levenshtein_distance('orange', 'pumpkin')))
print("damerau_levenshtein_distance('%s', '%s') = %d #unicode example\n" % ('Sjöstedt', 'Sjostedt', damerau_levenshtein_distance('Sjöstedt', 'Sjostedt'))) #unicode example

print('#normalized edit distances (low ratio means words are similar):')
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('smtih', 'smith', normalized_damerau_levenshtein_distance('smtih', 'smith')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('snapple', 'apple', normalized_damerau_levenshtein_distance('snapple', 'apple')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('testing', 'testtn', normalized_damerau_levenshtein_distance('testing', 'testtn')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('saturday', 'sunday', normalized_damerau_levenshtein_distance('saturday', 'sunday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('Saturday', 'saturday', normalized_damerau_levenshtein_distance('Saturday', 'saturday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('orange', 'pumpkin', normalized_damerau_levenshtein_distance('orange', 'pumpkin')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f #unicode example\n" % ('Sjöstedt', 'Sjostedt', normalized_damerau_levenshtein_distance('Sjöstedt', 'Sjostedt'))) #unicode example

print('#performance testing:')

#random words will be comprised of ascii letters, numbers, and spaces
chars = string.ascii_letters + string.digits + ' '
word1 = ''.join([random.choice(chars) for i in range(30)]) #generate a random string of characters of length 30
word2 = ''.join([random.choice(chars) for i in range(30)]) #and another
print("""timeit.timeit("damerau_levenshtein_distance('%s', '%s')", 'from pyxdameraulevenshtein import damerau_levenshtein_distance', number=500000) = %f seconds""" % \
	(word1, word2, timeit.timeit("damerau_levenshtein_distance('%s', '%s')" % (word1, word2), 'from pyxdameraulevenshtein import damerau_levenshtein_distance', number=500000)))