Exemple #1
0
def _mean_similarity_features(reference, joined, sep="##"):
    """Average three similarity scores of ``reference`` against every piece of ``joined``.

    ``joined`` is split on ``sep``; for each piece the function measures the
    length of the longest common subsequence, the length of the longest
    common substring and the Jaro-Winkler score, then returns the three
    means in that order.
    """
    jaro_winkler = textdistance.JaroWinkler()
    subsequence_lengths = []
    substring_lengths = []
    jaro_winkler_scores = []
    for piece in joined.split(sep):
        subsequence_lengths.append(
            len(longest_common_subsequence(reference, piece)))
        substring_lengths.append(
            len(longest_common_substring(reference, piece)))
        # Levenshtein and Jaccard were tried here as alternative metrics.
        jaro_winkler_scores.append(jaro_winkler(reference, piece))
    return [
        np.mean(subsequence_lengths),
        np.mean(substring_lengths),
        np.mean(jaro_winkler_scores),
    ]


def stringDistance_2(AuthorIdPaperId, dict_coauthor,
                     dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author,
                     Paper, Conference, Journal):
    """Build averaged string-similarity features for one (author, paper) pair.

    The name and affiliation stored under the key ``"<paperId>|<authorId>"``
    in ``dict_paperIdAuthorId_to_name_aff`` may hold several variants joined
    by ``"##"``. Each variant is compared with the author's own name and
    affiliation (columns 1 and 2 of the matching ``Author`` row) and the
    scores are averaged over the variants.

    ``dict_coauthor``, ``PaperAuthor``, ``Paper``, ``Conference`` and
    ``Journal`` are accepted but not used by this function.

    :return: ``util.get_feature_by_list`` applied to six values: mean LCS
        length, mean longest-common-substring length and mean Jaro-Winkler
        score for the name, followed by the same three for the affiliation.
    """
    author_id = AuthorIdPaperId.authorId
    paper_id = AuthorIdPaperId.paperId

    record = dict_paperIdAuthorId_to_name_aff["%s|%s" % (paper_id, author_id)]
    paper_names = str(record["name"])
    paper_affs = str(record["affiliation"])

    author_row = Author[Author["Id"] == int(author_id)].values[0]
    # Missing values stringify to "nan"; treat them as empty text.
    author_name = str(author_row[1])
    author_name = "" if author_name == "nan" else author_name
    author_aff = str(author_row[2])
    author_aff = "" if author_aff == "nan" else author_aff

    # Name features first, affiliation features second.
    features = _mean_similarity_features(author_name, paper_names)
    features += _mean_similarity_features(author_aff, paper_affs)

    return util.get_feature_by_list(features)
Exemple #2
0
def get_jaro_to_list(first4jaro, list4jaro, factor=0.9):
    """Greedily pair entries of two lists by Jaro score and combine the scores.

    Every entry is read through ``entry[0]`` and ``entry[1]``. The score of
    a pair is the Jaro score of the two ``[0]`` values, multiplied by the
    Jaro score of the two ``[1]`` values unless either ``[1]`` is the empty
    string. The best-scoring pair is removed from both lists, its squared
    score is kept as a factor, and the remaining entries are processed
    recursively.

    :param first4jaro: first list of two-item entries.
    :param list4jaro: second list of two-item entries.
    :param factor: penalty base; once one list is exhausted the result is
        multiplied by ``factor ** n`` where ``n`` is the number of entries
        left unpaired in the other list.
    :return: the combined score; ``0.0`` when either input is empty.
    """
    # With nothing to pair the best score stays at 0.0, so the product is 0.0.
    if len(first4jaro) == 0 or len(list4jaro) == 0:
        return 0.0

    # Plain Jaro: the Winkler prefix adjustment is switched off.
    jaro = textdistance.JaroWinkler(winklerize=False, external=False)

    best_score = 0.0
    # If no pair scores above 0, the first pair (0, 0) is taken.
    best_i = 0
    best_j = 0
    for i, item in enumerate(first4jaro):
        for j, data in enumerate(list4jaro):
            score = jaro(item[0], data[0])
            if item[1] != "" and data[1] != "":
                score = score * jaro(item[1], data[1])
            if score > best_score:
                best_score = score
                best_i = i
                best_j = j

    remaining_first = first4jaro[:best_i] + first4jaro[best_i + 1:]
    remaining_list = list4jaro[:best_j] + list4jaro[best_j + 1:]
    if len(remaining_first) == 0 or len(remaining_list) == 0:
        unpaired = abs(len(remaining_first) - len(remaining_list))
        return best_score * best_score * math.pow(factor, unpaired)
    # Forward ``factor``: the original recursion dropped it, so a caller's
    # custom penalty silently reverted to the default after the first pair.
    return best_score * best_score * get_jaro_to_list(
        remaining_first, remaining_list, factor=factor)
Exemple #3
0
def stringDistance_1(AuthorIdPaperId, dict_coauthor,
                     dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author,
                     Paper, Conference, Journal):
    """Build string-similarity features for one (author, paper) pair.

    The name and affiliation stored under the key ``"<paperId>|<authorId>"``
    in ``dict_paperIdAuthorId_to_name_aff`` are compared, as whole strings,
    with the author's own name and affiliation (columns 1 and 2 of the
    matching ``Author`` row).

    ``dict_coauthor``, ``PaperAuthor``, ``Paper``, ``Conference`` and
    ``Journal`` are accepted but not used by this function.

    :return: ``util.get_feature_by_list`` applied to six values: LCS length,
        longest-common-substring length and Jaro-Winkler score for the name,
        followed by the same three for the affiliation.
    """
    author_id = AuthorIdPaperId.authorId
    paper_id = AuthorIdPaperId.paperId

    record = dict_paperIdAuthorId_to_name_aff["%s|%s" % (paper_id, author_id)]
    paper_name = str(record["name"])
    paper_aff = str(record["affiliation"])

    author_row = Author[Author["Id"] == int(author_id)].values[0]
    # Missing values stringify to "nan"; treat them as empty text.
    author_name, author_aff = [
        "" if text == "nan" else text
        for text in (str(author_row[1]), str(author_row[2]))
    ]

    features = []
    # Name pair first, affiliation pair second; Levenshtein and Jaccard were
    # tried as alternatives to Jaro-Winkler.
    for own_text, paper_text in ((author_name, paper_name),
                                 (author_aff, paper_aff)):
        features.append(len(longest_common_subsequence(own_text, paper_text)))
        features.append(len(longest_common_substring(own_text, paper_text)))
        features.append(textdistance.JaroWinkler()(own_text, paper_text))

    return util.get_feature_by_list(features)
Exemple #4
0
def get_compared_data_file(data, language="en", data_kind="surname"):
    '''
    Look up ``data`` in the reference words loaded for ``language`` and
    ``data_kind`` and return the best match together with its score.

    Returns a ``(word, score)`` tuple:

    * ``(data, 1.0)`` when ``data`` is itself one of the reference words;
    * ``(candidate, jaro)`` for the reference word that shares the metaphone
      value of ``data`` and has the highest Jaro score against it (both
      sides lower-cased and passed through the language's "normalize"
      replacements before scoring);
    * ``(data, -1.0)`` when the language or data kind is unknown, or when no
      reference word shares the metaphone value.
    '''
    if language not in LANGUAGES_FILES:
        return data, -1.0
    if data_kind not in LANGUAGES_FILES[language]:
        return data, -1.0

    known_words = LANGUAGES_DATA[language][data_kind]
    # An exact hit needs neither the phonetic scan nor the fuzzy scoring.
    if data in known_words:
        return data, 1.0

    norm = LANGUAGES_FILES[language]["normalize"]

    def _normalized(text):
        # Replacements are applied one after another, in mapping order.
        text = text.lower()
        for raw, replacement in norm.items():
            text = text.replace(raw, replacement)
        return text

    data_in_met = adapted_doublemetaphone(data, language=language)
    data_temp = _normalized(data)
    jaro = textdistance.JaroWinkler(winklerize=False, external=False)
    results = {
        word: jaro(_normalized(word), data_temp)
        for word, met_value in known_words.items()
        if met_value == data_in_met
    }
    # The original used `any(results)`, which tests the truthiness of the
    # keys instead of whether any candidate exists at all.
    if not results:
        return data, -1.0
    best_word = max(results, key=results.get)
    return best_word, results[best_word]
Exemple #5
0
def get_name_from_fullname(full_name,
                           list_father_surnames,
                           list_mother_surnames,
                           language="en"):
    '''
    Given a full name that includes surnames, return the first name by
    blanking out every part that matches one of the known surnames.

    A part of the split full name is treated as a surname when its metaphone
    value (or that of the part with the language's connecting particles
    removed) matches the metaphone value of a known surname AND its Jaro
    score against at least one known surname exceeds THRESHOLD_JARO.

    The input surname lists are not modified.
    '''
    merged_list = list_father_surnames + list_mother_surnames
    # Iterate over a snapshot: the original appended to the very list it was
    # looping over. The particle-free variants added here never generate
    # further variants, so the resulting list is the same.
    for surname in list(merged_list):
        tokens = surname.split(" ")
        if len(tokens) > 1:
            # NOTE(review): this membership test is case-sensitive while the
            # one applied to the full-name parts below lower-cases first.
            kept = [
                "" if token in LANGUAGES_ADDS[language] else token
                for token in tokens
            ]
            new_surname = " ".join(kept).strip()
            if new_surname not in merged_list:
                merged_list.append(new_surname)

    merged_metaphore = []
    for data in merged_list:
        metaphone = adapted_doublemetaphone(data, language)
        if metaphone not in merged_metaphore:
            merged_metaphore.append(metaphone)

    full_name_list = get_splitted_name_from_complete_name(full_name, language)
    jaro = textdistance.JaroWinkler(winklerize=False, external=False)
    for i, value in enumerate(full_name_list[0]):
        # Drop the language-specific particles used inside surnames as
        # connectors. The original inner loop reused the name `value`, so the
        # metaphone test below saw only the last token of a multi-word part.
        check_surname = value.split(" ")
        if len(check_surname) > 1:
            check_surname = [
                "" if token.lower() in LANGUAGES_ADDS[language] else token
                for token in check_surname
            ]
        # NOTE(review): tokens are joined without a separator here, unlike the
        # space-joined variants built for merged_list above - confirm intent.
        adapted_surname = "".join(check_surname).rstrip()
        if (adapted_doublemetaphone(value, language) in merged_metaphore
                or adapted_doublemetaphone(adapted_surname, language)
                in merged_metaphore):
            # The metaphone match alone is not reliable: cross-check against
            # the written form to reject names that sound close but are
            # spelled very differently.
            similar = 0
            for compared in merged_list:
                score = jaro(adapted_surname, compared)
                if score > similar:
                    similar = score
            if similar > THRESHOLD_JARO:
                full_name_list[0][i] = ""
    return " ".join(full_name_list[0]).rstrip()
Exemple #6
0
def jaro_winkler_dist(
    seq_x: Sequence[Hashable], seq_y: Sequence[Hashable], normal: bool = False
) -> float:
    """
    Computes the Jaro-Winkler distance between two sequences.

    The similarity is obtained from the implementation provided by the
    `textdistance` library and returned as ``1.0 - similarity``.

    The function accepts the `normal` parameter so that it can be called like
    the other methods, but it is redundant as the Jaro-Winkler distance is
    already in range [0..1]; passing a true value only logs a warning.

    Example
    ********

    .. code-block:: python

        >>> seqsim.edit.jaro_winkler_dist("abc", "bcde")
        0.2777777777777778

    References
    ***********

    Jaro, M. A. (1989). "Advances in record linkage methodology as applied to the 1985 census of Tampa Florida".
    Journal of the American Statistical Association. 84 (406): 414–20. doi:10.1080/01621459.1989.10478785.

    Jaro, M. A. (1995). "Probabilistic linkage of large public health data file". Statistics in Medicine. 14 (5–7):
    491–8. doi:10.1002/sim.4780140510. PMID 7792443.

    Winkler, W. E. (1990). "String Comparator Metrics and Enhanced Decision Rules in the Fellegi-Sunter Model of
    Record Linkage". Proceedings of the Section on Survey Research Methods. American Statistical Association: 354–359.

    Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions" (PDF). Research
    Report Series, RRS.

    :param seq_x: The first sequence of elements to be compared.
    :param seq_y: The second sequence of elements to be compared.
    :param normal: Dummy parameter, see comment above.
    :return: The Jaro-Winkler distance between the two sequences.
    """

    scorer = textdistance.JaroWinkler(winklerize=True, external=False)
    similarity = scorer(seq_x, seq_y)

    if normal:
        logging.warning(
            "Jaro-Winkler distance is always in [0..1] range, no need for `normal` parameter."
        )

    # The scorer yields a similarity; the distance is its complement.
    return 1.0 - similarity
Exemple #7
0
def get_feature(author_id, paper_id):
    """Return name and affiliation comparison features for an author on a paper.

    The author's row in ``author_data`` is compared with every row of
    ``paper_author_data`` that has the given paper and author ids, using
    four scoring callables (Jaro-Winkler, Jaccard, Levenshtein and
    ``fuzz.token_sort_ratio``). For each callable the smallest value seen
    over the matching rows is kept.

    :return: a list of eight values: the four name scores followed by the
        four affiliation scores. A score stays at its starting value of
        10000 when no row matches.
    """
    scorers = [
        textdistance.JaroWinkler(),
        textdistance.Jaccard(),
        textdistance.Levenshtein(),
        fuzz.token_sort_ratio
    ]

    author = author_data[author_data['Id'] == author_id].iloc[0]
    same_paper = paper_author_data['PaperId'] == paper_id
    same_author = paper_author_data['AuthorId'] == author_id
    matching_rows = paper_author_data[same_paper & same_author]

    best_name = [10000] * len(scorers)
    best_aff = [10000] * len(scorers)
    for _, entry in matching_rows.iterrows():
        for idx, scorer in enumerate(scorers):
            name_score = scorer(author.Name, entry.Name)
            if name_score < best_name[idx]:
                best_name[idx] = name_score
            # Affiliations are stringified before comparison; names are not.
            aff_score = scorer(str(author.Affiliation), str(entry.Affiliation))
            if aff_score < best_aff[idx]:
                best_aff[idx] = aff_score

    return best_name + best_aff
Exemple #8
0
def score_of_given_name_and_meta(first4jaro,
                                 list4jaro,
                                 name1,
                                 name2,
                                 factor=0.9):
    '''
    Score two names by taking the larger of the squared direct Jaro score of
    name1/name2 and the list-based score from get_jaro_to_list, then cap the
    result when the two names differ a lot in length.

    Returns 0.0 when both names are empty.
    '''
    longest = max(len(name1), len(name2))
    # Guard against empty input (it would also divide by zero below).
    if longest == 0:
        return 0.0

    # Jaro behaves oddly for names of very different length, so the relative
    # length difference is turned into a penalty.
    len_factor = abs(len(name1) - len(name2)) / longest
    length_cap = (1 - len_factor) * (1 - len_factor)

    jaro = textdistance.JaroWinkler(winklerize=False, external=False)
    score_compare = jaro(name1, name2)
    score_met = get_jaro_to_list(first4jaro, list4jaro, factor=factor)
    best = max(score_met, score_compare * score_compare)

    # The cap applies only when the lengths differ by a third or more and the
    # cap is not above the score, i.e. only when it lowers the result.
    if len_factor < 0.33 or length_cap > best:
        return best
    return length_cap
Exemple #9
0
def restruct_fault(paper):
    """Re-link the fault records of ``paper`` to the questions in its content.

    Steps, in order:

    1. Copy the groups from ``paper.content_object`` and index every
       question by title, attaching a small ``group`` summary (title, memo,
       number) to each copied question.
    2. Deactivate every ``Fault`` of the paper whose ``question_id`` is not
       among the current question numbers.
    3. For each fault of the paper, find the current question whose title is
       most similar (Jaro-Winkler similarity strictly above 0.8) to the
       title stored on the fault. Only the text after the last ``'题】'``
       marker of each title is compared. Faults without a match are
       deactivated; matched faults whose title, options, answer or
       explanation differ are updated with the current question.

    :return: a dict with the ids of updated faults under ``True`` and the
        ids of faults deactivated for lack of a match under ``False``.
    """
    import textdistance
    from copy import deepcopy

    scorer = textdistance.JaroWinkler()

    questions_by_title = {}
    question_numbers = []
    for group in deepcopy(paper.content_object.get('groups')):
        for question in group.get('questions'):
            question['group'] = dict(title=group.get('title'),
                                     memo=group.get('memo'),
                                     number=group.get('number'))
            questions_by_title[question.get('title')] = question
            question_numbers.append(question.get('number'))

    stale = models.Fault.objects.filter(paper=paper).exclude(
        question_id__in=question_numbers)
    stale.update(is_active=False)

    outcome = {True: [], False: []}
    for fault in paper.faults.all():
        stored = fault.question
        stored_title = stored.get('title')
        stored_text = stored_title.split('题】')[-1]

        matched = None
        best_similarity = 0.8  # minimum similarity a candidate must exceed
        for title, candidate in questions_by_title.items():
            similarity = scorer.similarity(stored_text, title.split('题】')[-1])
            if similarity > best_similarity:
                matched = candidate
                best_similarity = similarity

        if not matched:
            outcome[False].append(fault.id)
            fault.is_active = False
            fault.save()
            continue

        # Rewrite the fault only when something visible actually changed.
        changed = any(
            stored.get(field) != matched.get(field)
            for field in ('title', 'options', 'answer', 'explanation'))
        if changed:
            outcome[True].append(fault.id)
            fault.question = matched
            fault.save()
    return outcome
 def jarowinkler(string1, string2):
     """Return the normalized Jaro-Winkler distance between two strings.

     Delegates to ``normalized_distance`` of a default
     ``textdistance.JaroWinkler`` instance.
     """
     return textdistance.JaroWinkler().normalized_distance(string1, string2)
Exemple #11
0
    def __init__(self, pa_preprocessor, name, qval=1):
        """Create the wrapper and select the textdistance measure by name.

        :param pa_preprocessor: forwarded unchanged to the parent initialiser.
        :param name: name of the measure to use, e.g. ``'Levenshtein'`` or
            ``'Jaccard'``; see ``supported`` below for the full list.
        :param qval: stored on the instance and passed as ``qval`` to the
            measure's constructor, except for the measures listed in
            ``built_without_qval``.
        :raises ValueError: if ``name`` is not a supported measure. The
            original if/elif chain fell through silently and left
            ``self.similar_measure`` unset.
        """
        super().__init__(pa_preprocessor)

        self.time_log = []
        self.qval = qval
        self.textdistance_name = name

        supported = (
            # Edit based
            'Hamming', 'DamerauLevenshtein', 'Levenshtein', 'Mlipns', 'Jaro',
            'JaroWinkler', 'StrCmp95', 'NeedlemanWunsch', 'Gotoh',
            'SmithWaterman',
            # Token based
            'Jaccard', 'Sorensen', 'Tversky', 'Overlap', 'Tanimoto', 'Cosine',
            'MongeElkan', 'Bag',
            # Sequence based
            'LCSSeq', 'LCSStr', 'RatcliffObershelp',
            # Compression based
            'ArithNCD', 'RLENCD', 'BWTRLENCD', 'SqrtNCD', 'EntropyNCD',
            # Simple
            'Prefix', 'Postfix', 'Length', 'Identity', 'Matrix',
        )
        # Measure names whose textdistance class is spelled differently.
        class_aliases = {'Mlipns': 'MLIPNS'}
        # These were constructed with no arguments in the original chain.
        built_without_qval = ('StrCmp95', 'Tversky', 'BWTRLENCD', 'Matrix')

        if name not in supported:
            raise ValueError("Unknown textdistance measure: %r" % (name,))

        # Resolve the class lazily so only the requested attribute is touched.
        measure_cls = getattr(textdistance, class_aliases.get(name, name))
        if name in built_without_qval:
            self.similar_measure = measure_cls()
        else:
            self.similar_measure = measure_cls(qval=qval)
Exemple #12
0
def simple_example():
    str1, str2 = 'test', 'text'
    qval = 2

    #--------------------
    # Edit-based.
    if True:
        print("textdistance.hamming({}, {}) = {}.".format(
            str1, str2, textdistance.hamming(str1, str2)))
        print("textdistance.hamming.distance({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.distance(str1, str2)))
        print("textdistance.hamming.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.similarity(str1, str2)))
        print("textdistance.hamming.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.normalized_distance(str1, str2)))
        print(
            "textdistance.hamming.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.hamming.normalized_similarity(str1, str2)))
        print(
            "textdistance.Hamming(qval={}, test_func=None, truncate=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Hamming(qval=qval,
                                     test_func=None,
                                     truncate=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.mlipns({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns(str1, str2)))
        print("textdistance.mlipns.distance({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.distance(str1, str2)))
        print("textdistance.mlipns.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.similarity(str1, str2)))
        print("textdistance.mlipns.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.normalized_distance(str1, str2)))
        print("textdistance.mlipns.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.normalized_similarity(str1, str2)))
        print(
            "textdistance.MLIPNS(threshold=0.25, maxmismatches=2, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.MLIPNS(threshold=0.25,
                                    maxmismatches=2,
                                    qval=qval,
                                    external=True).distance(str1, str2)))

        print("textdistance.levenshtein({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein(str1, str2)))
        print("textdistance.levenshtein.distance({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein.distance(str1, str2)))
        print("textdistance.levenshtein.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein.similarity(str1, str2)))
        print("textdistance.levenshtein.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.levenshtein.normalized_distance(str1, str2)))
        print("textdistance.levenshtein.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.levenshtein.normalized_similarity(str1, str2)))
        print(
            "textdistance.Levenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Levenshtein(qval=qval,
                                         test_func=None,
                                         external=True).distance(str1, str2)))

        print("textdistance.damerau_levenshtein({}, {}) = {}.".format(
            str1, str2, textdistance.damerau_levenshtein(str1, str2)))
        print("textdistance.damerau_levenshtein.distance({}, {}) = {}.".format(
            str1, str2, textdistance.damerau_levenshtein.distance(str1, str2)))
        print(
            "textdistance.damerau_levenshtein.similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.damerau_levenshtein.similarity(str1, str2)))
        print(
            "textdistance.damerau_levenshtein.normalized_distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.damerau_levenshtein.normalized_distance(
                    str1, str2)))
        print(
            "textdistance.damerau_levenshtein.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.damerau_levenshtein.normalized_similarity(
                    str1, str2)))
        print(
            "textdistance.DamerauLevenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.DamerauLevenshtein(qval=qval,
                                                test_func=None,
                                                external=True).distance(
                                                    str1, str2)))

        print("textdistance.jaro({}, {}) = {}.".format(
            str1, str2, textdistance.jaro(str1, str2)))
        print("textdistance.jaro.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.distance(str1, str2)))
        print("textdistance.jaro.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.similarity(str1, str2)))
        print("textdistance.jaro.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.normalized_distance(str1, str2)))
        print("textdistance.jaro.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.normalized_similarity(str1, str2)))
        print(
            "textdistance.Jaro(long_tolerance=False, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Jaro(long_tolerance=False,
                                  qval=qval,
                                  external=True).distance(str1, str2)))

        print("textdistance.jaro_winkler({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler(str1, str2)))
        print("textdistance.jaro_winkler.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler.distance(str1, str2)))
        print("textdistance.jaro_winkler.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler.similarity(str1, str2)))
        print("textdistance.jaro_winkler.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.jaro_winkler.normalized_distance(str1,
                                                                   str2)))
        print("textdistance.jaro_winkler.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.jaro_winkler.normalized_similarity(str1, str2)))
        print(
            "textdistance.JaroWinkler(long_tolerance=False, winklerize=True, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.JaroWinkler(long_tolerance=False,
                                         winklerize=True,
                                         qval=qval,
                                         external=True).distance(str1, str2)))

        print("textdistance.strcmp95({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95(str1, str2)))
        print("textdistance.strcmp95.distance({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.distance(str1, str2)))
        print("textdistance.strcmp95.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.similarity(str1, str2)))
        print("textdistance.strcmp95.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.normalized_distance(str1, str2)))
        print(
            "textdistance.strcmp95.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.strcmp95.normalized_similarity(str1, str2)))
        print(
            "textdistance.StrCmp95(long_strings=False, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.StrCmp95(long_strings=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.needleman_wunsch({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch(str1, str2)))
        print("textdistance.needleman_wunsch.distance({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch.distance(str1, str2)))
        print("textdistance.needleman_wunsch.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch.similarity(str1, str2)))
        print(
            "textdistance.needleman_wunsch.normalized_distance({}, {}) = {}.".
            format(
                str1, str2,
                textdistance.needleman_wunsch.normalized_distance(str1, str2)))
        print(
            "textdistance.needleman_wunsch.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.needleman_wunsch.normalized_similarity(
                    str1, str2)))
        print(
            "textdistance.NeedlemanWunsch(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.NeedlemanWunsch(gap_cost=1.0,
                                             sim_func=None,
                                             qval=qval,
                                             external=True).distance(
                                                 str1, str2)))

        print("textdistance.gotoh({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh(str1, str2)))
        print("textdistance.gotoh.distance({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.distance(str1, str2)))
        print("textdistance.gotoh.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.similarity(str1, str2)))
        print("textdistance.gotoh.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.normalized_distance(str1, str2)))
        print("textdistance.gotoh.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.normalized_similarity(str1, str2)))
        print(
            "textdistance.Gotoh(gap_open=1, gap_ext=0.4, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Gotoh(gap_open=1,
                                   gap_ext=0.4,
                                   sim_func=None,
                                   qval=qval,
                                   external=True).distance(str1, str2)))

        print("textdistance.smith_waterman({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman(str1, str2)))
        print("textdistance.smith_waterman.distance({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman.distance(str1, str2)))
        print("textdistance.smith_waterman.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman.similarity(str1, str2)))
        print("textdistance.smith_waterman.normalized_distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.smith_waterman.normalized_distance(str1, str2)))
        print(
            "textdistance.smith_waterman.normalized_similarity({}, {}) = {}.".
            format(
                str1, str2,
                textdistance.smith_waterman.normalized_similarity(str1, str2)))
        print(
            "textdistance.SmithWaterman(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.SmithWaterman(gap_cost=1.0,
                                           sim_func=None,
                                           qval=qval,
                                           external=True).distance(str1,
                                                                   str2)))

    #--------------------
    # Token-based.
    if False:
        print("textdistance.jaccard({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard(str1, str2)))
        print("textdistance.jaccard.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.distance(str1, str2)))
        print("textdistance.jaccard.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.similarity(str1, str2)))
        print("textdistance.jaccard.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.normalized_distance(str1, str2)))
        print(
            "textdistance.jaccard.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.jaccard.normalized_similarity(str1, str2)))
        print(
            "textdistance.Jaccard(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Jaccard(qval=qval, as_set=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.sorensen({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen(str1, str2)))
        print("textdistance.sorensen.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.distance(str1, str2)))
        print("textdistance.sorensen.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.similarity(str1, str2)))
        print("textdistance.sorensen.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.normalized_distance(str1, str2)))
        print(
            "textdistance.sorensen.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.sorensen.normalized_similarity(str1, str2)))
        print(
            "textdistance.Sorensen(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Sorensen(qval=qval, as_set=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.sorensen_dice({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice(str1, str2)))
        print("textdistance.sorensen_dice.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice.distance(str1, str2)))
        print("textdistance.sorensen_dice.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice.similarity(str1, str2)))
        print("textdistance.sorensen_dice.normalized_distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.sorensen_dice.normalized_distance(str1, str2)))
        print("textdistance.sorensen_dice.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.sorensen_dice.normalized_similarity(str1,
                                                                   str2)))
        #print("textdistance.SorensenDice().distance({}, {}) = {}.".format(str1, str2, textdistance.SorensenDice().distance(str1, str2)))

        print("textdistance.tversky({}, {}) = {}.".format(
            str1, str2, textdistance.tversky(str1, str2)))
        print("textdistance.tversky.distance({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.distance(str1, str2)))
        print("textdistance.tversky.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.similarity(str1, str2)))
        print("textdistance.tversky.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.normalized_distance(str1, str2)))
        print(
            "textdistance.tversky.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.tversky.normalized_similarity(str1, str2)))
        print(
            "textdistance.Tversky(qval={}, ks=None, bias=None, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Tversky(qval=qval,
                                     ks=None,
                                     bias=None,
                                     as_set=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.overlap({}, {}) = {}.".format(
            str1, str2, textdistance.overlap(str1, str2)))
        print("textdistance.overlap.distance({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.distance(str1, str2)))
        print("textdistance.overlap.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.similarity(str1, str2)))
        print("textdistance.overlap.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.normalized_distance(str1, str2)))
        print(
            "textdistance.overlap.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.overlap.normalized_similarity(str1, str2)))
        print(
            "textdistance.Overlap(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Overlap(qval=qval, as_set=False,
                                     external=True).distance(str1, str2)))

        # This is identical to the Jaccard similarity coefficient and the Tversky index for alpha=1 and beta=1.
        print("textdistance.tanimoto({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto(str1, str2)))
        print("textdistance.tanimoto.distance({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.distance(str1, str2)))
        print("textdistance.tanimoto.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.similarity(str1, str2)))
        print("textdistance.tanimoto.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.normalized_distance(str1, str2)))
        print(
            "textdistance.tanimoto.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.tanimoto.normalized_similarity(str1, str2)))
        print(
            "textdistance.Tanimoto(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Tanimoto(qval=qval, as_set=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.cosine({}, {}) = {}.".format(
            str1, str2, textdistance.cosine(str1, str2)))
        print("textdistance.cosine.distance({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.distance(str1, str2)))
        print("textdistance.cosine.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.similarity(str1, str2)))
        print("textdistance.cosine.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.normalized_distance(str1, str2)))
        print("textdistance.cosine.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.normalized_similarity(str1, str2)))
        print(
            "textdistance.Cosine(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Cosine(qval=qval, as_set=False,
                                    external=True).distance(str1, str2)))

        print("textdistance.monge_elkan({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan(str1, str2)))
        print("textdistance.monge_elkan.distance({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan.distance(str1, str2)))
        print("textdistance.monge_elkan.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan.similarity(str1, str2)))
        print("textdistance.monge_elkan.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.monge_elkan.normalized_distance(str1, str2)))
        print("textdistance.monge_elkan.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.monge_elkan.normalized_similarity(str1, str2)))
        print(
            "textdistance.MongeElkan(algorithm=textdistance.DamerauLevenshtein(), symmetric=False, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.MongeElkan(
                    algorithm=textdistance.DamerauLevenshtein(),
                    symmetric=False,
                    qval=qval,
                    external=True).distance(str1, str2)))

        print("textdistance.bag({}, {}) = {}.".format(
            str1, str2, textdistance.bag(str1, str2)))
        print("textdistance.bag.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bag.distance(str1, str2)))
        print("textdistance.bag.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bag.similarity(str1, str2)))
        print("textdistance.bag.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.bag.normalized_distance(str1, str2)))
        print("textdistance.bag.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bag.normalized_similarity(str1, str2)))
        print("textdistance.Bag(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.Bag(qval=qval).distance(str1, str2)))

    #--------------------
    # Sequence-based.
    if False:
        print("textdistance.lcsseq({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq(str1, str2)))
        print("textdistance.lcsseq.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.distance(str1, str2)))
        print("textdistance.lcsseq.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.similarity(str1, str2)))
        print("textdistance.lcsseq.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.normalized_distance(str1, str2)))
        print("textdistance.lcsseq.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.normalized_similarity(str1, str2)))
        #print("textdistance.LCSSeq(qval={}, test_func=None, external=True).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.LCSSeq(qval=qval, test_func=None, external=True).distance(str1, str2)))
        print("textdistance.LCSSeq().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.LCSSeq().distance(str1, str2)))

        print("textdistance.lcsstr({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr(str1, str2)))
        print("textdistance.lcsstr.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.distance(str1, str2)))
        print("textdistance.lcsstr.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.similarity(str1, str2)))
        print("textdistance.lcsstr.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.normalized_distance(str1, str2)))
        print("textdistance.lcsstr.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.normalized_similarity(str1, str2)))
        print("textdistance.LCSStr(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.LCSStr(qval=qval).distance(str1, str2)))

        print("textdistance.ratcliff_obershelp({}, {}) = {}.".format(
            str1, str2, textdistance.ratcliff_obershelp(str1, str2)))
        print("textdistance.ratcliff_obershelp.distance({}, {}) = {}.".format(
            str1, str2, textdistance.ratcliff_obershelp.distance(str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.ratcliff_obershelp.similarity(str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.normalized_distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.ratcliff_obershelp.normalized_distance(
                    str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.ratcliff_obershelp.normalized_similarity(
                    str1, str2)))
        print("textdistance.RatcliffObershelp().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.RatcliffObershelp().distance(str1, str2)))

    #--------------------
    # Compression-based.
    if False:
        print("textdistance.arith_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd(str1, str2)))
        print("textdistance.arith_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd.distance(str1, str2)))
        print("textdistance.arith_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd.similarity(str1, str2)))
        print(
            "textdistance.arith_ncd.normalized_distance({}, {}) = {}.".format(
                str1, str2,
                textdistance.arith_ncd.normalized_distance(str1, str2)))
        print("textdistance.arith_ncd.normalized_similarity({}, {}) = {}.".
              format(str1, str2,
                     textdistance.arith_ncd.normalized_similarity(str1, str2)))
        #print("textdistance.ArithNCD(base=2, terminator=None, qval={}).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.ArithNCD(base=2, terminator=None, qval=qval).distance(str1, str2)))
        print("textdistance.ArithNCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.ArithNCD().distance(str1, str2)))

        print("textdistance.rle_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd(str1, str2)))
        print("textdistance.rle_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.distance(str1, str2)))
        print("textdistance.rle_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.similarity(str1, str2)))
        print("textdistance.rle_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.rle_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.rle_ncd.normalized_similarity(str1, str2)))
        print("textdistance.RLENCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.RLENCD().distance(str1, str2)))

        print("textdistance.bwtrle_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd(str1, str2)))
        print("textdistance.bwtrle_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd.distance(str1, str2)))
        print("textdistance.bwtrle_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd.similarity(str1, str2)))
        print(
            "textdistance.bwtrle_ncd.normalized_distance({}, {}) = {}.".format(
                str1, str2,
                textdistance.bwtrle_ncd.normalized_distance(str1, str2)))
        print("textdistance.bwtrle_ncd.normalized_similarity({}, {}) = {}.".
              format(str1, str2,
                     textdistance.bwtrle_ncd.normalized_similarity(str1,
                                                                   str2)))
        print("textdistance.BWTRLENCD(terminator='\0').distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.BWTRLENCD(terminator='\0').distance(str1,
                                                                   str2)))

        print("textdistance.sqrt_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd(str1, str2)))
        print("textdistance.sqrt_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.distance(str1, str2)))
        print("textdistance.sqrt_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.similarity(str1, str2)))
        print("textdistance.sqrt_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.sqrt_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.sqrt_ncd.normalized_similarity(str1, str2)))
        print("textdistance.SqrtNCD(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.SqrtNCD(qval=qval).distance(str1, str2)))

        print("textdistance.entropy_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd(str1, str2)))
        print("textdistance.entropy_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd.distance(str1, str2)))
        print("textdistance.entropy_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd.similarity(str1, str2)))
        print("textdistance.entropy_ncd.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.entropy_ncd.normalized_distance(str1, str2)))
        print("textdistance.entropy_ncd.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.entropy_ncd.normalized_similarity(str1, str2)))
        print(
            "textdistance.EntropyNCD(qval={}, coef=1, base=2).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.EntropyNCD(qval=qval, coef=1,
                                        base=2).distance(str1, str2)))

        print("textdistance.bz2_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd(str1, str2)))
        print("textdistance.bz2_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.distance(str1, str2)))
        print("textdistance.bz2_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.similarity(str1, str2)))
        print("textdistance.bz2_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.bz2_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.bz2_ncd.normalized_similarity(str1, str2)))
        print("textdistance.BZ2NCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.BZ2NCD().distance(str1, str2)))

        print("textdistance.lzma_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd(str1, str2)))
        print("textdistance.lzma_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.distance(str1, str2)))
        print("textdistance.lzma_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.similarity(str1, str2)))
        print("textdistance.lzma_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.lzma_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.lzma_ncd.normalized_similarity(str1, str2)))
        print("textdistance.LZMANCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.LZMANCD().distance(str1, str2)))

        print("textdistance.zlib_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd(str1, str2)))
        print("textdistance.zlib_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.distance(str1, str2)))
        print("textdistance.zlib_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.similarity(str1, str2)))
        print("textdistance.zlib_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.zlib_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.zlib_ncd.normalized_similarity(str1, str2)))
        print("textdistance.ZLIBNCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.ZLIBNCD().distance(str1, str2)))

    #--------------------
    # Phonetic.
    if False:
        print("textdistance.mra({}, {}) = {}.".format(
            str1, str2, textdistance.mra(str1, str2)))
        print("textdistance.mra.distance({}, {}) = {}.".format(
            str1, str2, textdistance.mra.distance(str1, str2)))
        print("textdistance.mra.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mra.similarity(str1, str2)))
        print("textdistance.mra.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.mra.normalized_distance(str1, str2)))
        print("textdistance.mra.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mra.normalized_similarity(str1, str2)))
        print("textdistance.MRA().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.MRA().distance(str1, str2)))

        print("textdistance.editex({}, {}) = {}.".format(
            str1, str2, textdistance.editex(str1, str2)))
        print("textdistance.editex.distance({}, {}) = {}.".format(
            str1, str2, textdistance.editex.distance(str1, str2)))
        print("textdistance.editex.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.editex.similarity(str1, str2)))
        print("textdistance.editex.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.editex.normalized_distance(str1, str2)))
        print("textdistance.editex.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.editex.normalized_similarity(str1, str2)))
        print(
            "textdistance.Editex(local=False, match_cost=0, group_cost=1, mismatch_cost=2, groups=None, ungrouped=None, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.Editex(local=False,
                                    match_cost=0,
                                    group_cost=1,
                                    mismatch_cost=2,
                                    groups=None,
                                    ungrouped=None,
                                    external=True).distance(str1, str2)))

    #--------------------
    # Simple.
    if False:
        print("textdistance.prefix({}, {}) = {}.".format(
            str1, str2, textdistance.prefix(str1, str2)))
        print("textdistance.prefix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.distance(str1, str2)))
        print("textdistance.prefix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.similarity(str1, str2)))
        print("textdistance.prefix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.normalized_distance(str1, str2)))
        print("textdistance.prefix.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.normalized_similarity(str1, str2)))
        print(
            "textdistance.Prefix(qval={}, sim_test=None).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Prefix(qval=qval,
                                    sim_test=None).distance(str1, str2)))

        print("textdistance.postfix({}, {}) = {}.".format(
            str1, str2, textdistance.postfix(str1, str2)))
        print("textdistance.postfix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.distance(str1, str2)))
        print("textdistance.postfix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.similarity(str1, str2)))
        print("textdistance.postfix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.normalized_distance(str1, str2)))
        print(
            "textdistance.postfix.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.postfix.normalized_similarity(str1, str2)))
        #print("textdistance.Postfix(qval={}, sim_test=None).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.Postfix(qval=qval, sim_test=None).distance(str1, str2)))
        print("textdistance.Postfix().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Postfix().distance(str1, str2)))

        print("textdistance.length({}, {}) = {}.".format(
            str1, str2, textdistance.length(str1, str2)))
        print("textdistance.length.distance({}, {}) = {}.".format(
            str1, str2, textdistance.length.distance(str1, str2)))
        print("textdistance.length.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.length.similarity(str1, str2)))
        print("textdistance.length.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.length.normalized_distance(str1, str2)))
        print("textdistance.length.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.length.normalized_similarity(str1, str2)))
        print("textdistance.Length().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Length().distance(str1, str2)))

        print("textdistance.identity({}, {}) = {}.".format(
            str1, str2, textdistance.identity(str1, str2)))
        print("textdistance.identity.distance({}, {}) = {}.".format(
            str1, str2, textdistance.identity.distance(str1, str2)))
        print("textdistance.identity.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.identity.similarity(str1, str2)))
        print("textdistance.identity.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.identity.normalized_distance(str1, str2)))
        print(
            "textdistance.identity.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.identity.normalized_similarity(str1, str2)))
        print("textdistance.Identity().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Identity().distance(str1, str2)))

        print("textdistance.matrix({}, {}) = {}.".format(
            str1, str2, textdistance.matrix(str1, str2)))
        print("textdistance.matrix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.distance(str1, str2)))
        print("textdistance.matrix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.similarity(str1, str2)))
        print("textdistance.matrix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.normalized_distance(str1, str2)))
        print("textdistance.matrix.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.normalized_similarity(str1, str2)))
        print(
            "textdistance.Matrix(mat=None, mismatch_cost=0, match_cost=1, symmetric=True, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.Matrix(mat=None,
                                    mismatch_cost=0,
                                    match_cost=1,
                                    symmetric=True,
                                    external=True).distance(str1, str2)))
 def setup(self):
     """Create the string-similarity scorers kept on this instance.

     Per the original author's note, this hook is called when the worker is
     created, so both scorers are built once and then reused.
     """
     # 3-grams for similarity calculation
     trigram_sorensen = textdistance.Sorensen(3)
     jaro_winkler = textdistance.JaroWinkler()

     self._sorensen = trigram_sorensen
     self._jaro_winkler = jaro_winkler
from flask import Flask, jsonify
import pandas as pd
import numpy as np
import textdistance

app = Flask(__name__)

# Loaded once at import time; the first CSV column becomes the index.
df = pd.read_csv("precomputed_neighbors.csv", index_col=0)

# Jaro-Winkler scorer, built once and shared by every comparison.
JW = textdistance.JaroWinkler(long_tolerance=True)

# Element-wise (broadcasting) wrapper around the scorer. np.vectorize is a
# convenience only: it still loops in Python, so it does not make the
# per-pair computation itself faster.
vec_JW = np.vectorize(lambda a, b: JW.normalized_similarity(a, b))

def get_recommendations(movie_id):
    """
    Return 5 movies among the nearest 20. The final 5 are picked at random,
    based on their score (when the given movie's score > 5), and the squared
    Euclidean distance.
    """

    if movie_id not in df.index:
        return {'error': 'The ID number is unknown'}

    series = df.loc[movie_id]

    title = series.movie_title
    score = series.imdb_score

    def s_split(string):
Exemple #15
0
def metrics(x):
    """Format one string pair as a line of numbered similarity features.

    ``x`` is read positionally:

    * ``x[0]`` -- emitted first on the line.
    * ``x[1]`` -- emitted as ``qid:<x[1]>``.
    * ``x[2]`` -- emitted after the trailing ``#``.
    * ``x[3]`` -- used verbatim as feature 1; must be accepted by ``float()``.
    * ``x[4]``, ``x[5]`` -- the two strings to compare; both are stripped of
      surrounding whitespace before any feature is computed.

    Features 2-39 come from ``td`` (presumably the ``textdistance`` module
    imported elsewhere in the file -- confirm). Each of them is best-effort:
    if computing one raises any ``Exception`` the feature is reported as 0.0.
    Features 40-43 compare the open/close balance of ``()``, ``[]``, ``{}``
    and ``<>`` between the two strings, normalised by the length of the
    first string.

    Returns:
        ``"<x[0]> qid:<x[1]> 1:<f1> 2:<f2> ... 43:<f43> # <x[2]>"`` with every
        feature rendered with four decimals.
    """
    a = x[4].strip()
    b = x[5].strip()

    al = a.lower()
    bl = b.lower()

    # Normaliser for the bracket-balance features. It is floored at 1 so an
    # empty first string no longer raises ZeroDivisionError; in that case the
    # features reduce to ``1 - |balance difference|``.
    a_len = float(max(len(a), 1))

    def tryit(func):
        # Deliberate best-effort: any failure of a single metric yields 0.0
        # instead of aborting the whole feature line.
        try:
            return func()
        except Exception:
            return 0.0

    def similarity(name, s, t, qval=None):
        # Normalised similarity from the ``td`` class called ``name``.
        # Attribute lookup and construction happen inside ``tryit`` so that
        # they are covered by the same 0.0 fallback as the computation.
        def compute():
            algorithm = getattr(td, name)
            scorer = algorithm() if qval is None else algorithm(qval=qval)
            return scorer.normalized_similarity(s, t)

        return tryit(compute)

    def balance(opener, closer, text):
        # +1 per opener, -1 per closer, 0 for every other character.
        return sum(
            1 if ch == opener else (-1 if ch == closer else 0) for ch in text)

    def balance_similarity(opener, closer):
        gap = abs(balance(opener, closer, a) - balance(opener, closer, b))
        return 1.0 - (float(gap) / a_len)

    # ``None`` means "construct with the library default"; the rest are the
    # explicit q-gram sizes.
    qvals = (None, 2, 3, 4, 5)

    features = [
        x[3],
        tryit(lambda: td.bz2_ncd(a, b)),
        tryit(lambda: td.zlib_ncd(a, b)),
        tryit(lambda: td.prefix.normalized_similarity(a, b)),
        tryit(lambda: td.postfix.normalized_similarity(a, b)),
        tryit(lambda: td.matrix.normalized_similarity(a, b)),
        tryit(lambda: td.length.normalized_similarity(a, b)),
    ]
    features += [similarity("Hamming", a, b, q) for q in qvals]
    features += [similarity("DamerauLevenshtein", a, b, q) for q in qvals]
    # Jaccard alternates case-sensitive and lower-cased inputs per q-gram
    # size; the order determines the feature numbering and must be kept.
    for q in qvals:
        features.append(similarity("Jaccard", a, b, q))
        features.append(similarity("Jaccard", al, bl, q))
    features += [similarity("Tversky", a, b, q) for q in qvals]
    features += [similarity("JaroWinkler", a, b, q) for q in qvals]
    features += [
        similarity("StrCmp95", a, b),
        similarity("StrCmp95", al, bl),
    ]
    features += [
        balance_similarity(opener, closer)
        for opener, closer in ("()", "[]", "{}", "<>")
    ]

    return '{} qid:{} {} # {}'.format(
        x[0], x[1], ' '.join(
            '{}:{:.4f}'.format(k + 1, float(y))
            for k, y in enumerate(features)),
        x[2])
Exemple #16
0
def run(experiment):
    """Set up and launch one training (or reinforce-test) run for *experiment*.

    Steps, in order:
      1. Read the hyperparameters and validate ``experiment.train_method``
         and the ``reward`` hyperparameter.
      2. Create (or wipe and re-create) ``checkpoints/<name>`` and
         ``tensorboard/train/<name>``.
      3. Drop an empty marker file named after the current git commit hash
         into the log directory.
      4. Load the data via ``data_preprocessing.get_data`` and shuffle the
         train and validation pairs with a fixed seed.
      5. Build the training model and a weight-sharing evaluation model.
      6. Dispatch to ``train.reinforce_test`` or ``train.train``.

    Args:
        experiment: object exposing ``name``, ``hyperparams`` (a mapping),
            ``data`` (a mapping), ``tokenization``, ``train_method``,
            ``start_checkpoint`` and ``max_hours``.

    Raises:
        ValueError: if ``train_method`` is not one of ``'MLE'``,
            ``'reinforce'`` or ``'reinforce_test'``, or if a training run
            needs a reward and the configured one is not recognised. The
            check happens before any directory is touched, so a bad
            configuration no longer wipes existing checkpoints first.
    """
    # Missing hyperparameters come back as None (dict.get semantics).
    (epochs, input_batch_size, rnn_size, num_layers,
     encoding_embedding_size, decoding_embedding_size, learning_rate,
     keep_probability, num_samples, reward) = map(
        experiment.hyperparams.get,
        ('epochs', 'input_batch_size', 'rnn_size', 'num_layers',
         'encoding_embedding_size', 'decoding_embedding_size',
         'learning_rate', 'keep_probability', 'num_samples', "reward"))
    train_method = experiment.train_method

    # --- validate configuration before any destructive side effect ---
    if train_method == 'MLE':
        graph_batch_size = input_batch_size
    elif train_method in ('reinforce', 'reinforce_test'):
        graph_batch_size = num_samples
    else:
        raise ValueError(
            "unknown train_method {!r}; expected 'MLE', 'reinforce' or "
            "'reinforce_test'".format(train_method))

    # set reward function
    # Each reward takes a (reference, hypothesis) pair; distances are negated
    # so that a larger reward always means a closer match.
    if reward == "levenshtein":
        reward_func = lambda ref_hyp: - textdistance.levenshtein(ref_hyp[0], ref_hyp[1])
    elif reward == "jaro-winkler":
        # Build the scorer once instead of on every reward evaluation.
        jaro_winkler = textdistance.JaroWinkler()
        reward_func = lambda ref_hyp: jaro_winkler(ref_hyp[0], ref_hyp[1])
    elif reward == "hamming":
        reward_func = lambda ref_hyp: - textdistance.hamming(ref_hyp[0], ref_hyp[1])
    elif train_method == 'reinforce_test' or (reward is None and train_method == 'MLE'):
        # reinforce_test never receives reward_func. For MLE without a
        # configured reward we pass None through.
        # NOTE(review): assumes train.train ignores reward_func under MLE -
        # TODO confirm.
        reward_func = None
    else:
        raise ValueError(
            "unknown reward {!r}; expected 'levenshtein', 'jaro-winkler' or "
            "'hamming'".format(reward))

    save_path = "checkpoints/" + experiment.name
    log_path = "tensorboard/train/" + experiment.name
    # create or clean directory
    for path in (save_path, log_path):
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)
    save_path += "/dev"

    # log git commit hash (as the name of an empty marker file)
    repo = git.Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha
    with open(log_path + "/git_commit_" + sha, 'w'):
        pass

    ### prepare data ###
    ((train_source, train_target),
     (valid_source, valid_target),
     (source_vocab_to_int, target_vocab_to_int),
     (source_int_to_vocab, target_int_to_vocab)) = data_preprocessing.get_data(
        experiment.data["dataset"], experiment.data["folder"],
        experiment.data["train_source_file"],
        experiment.data["train_target_file"],
        experiment.data["dev_source_file"],
        experiment.data["dev_target_file"],
        experiment.tokenization)

    max_source_sentence_length = max(len(sentence) for sentence in train_source)

    # shuffle source/target pairs together; one seeded generator is used for
    # train first and validation second, so the order is reproducible
    rnd = random.Random(1234)
    train_combined = list(zip(train_source, train_target))
    rnd.shuffle(train_combined)
    train_source, train_target = zip(*train_combined)

    valid_combined = list(zip(valid_source, valid_target))
    rnd.shuffle(valid_combined)
    valid_source, valid_target = zip(*valid_combined)

    ### prepare model ###
    tf.reset_default_graph()
    with tf.variable_scope(tf.get_variable_scope(), reuse=False):
        model = rnn_model.RNN(
            graph_batch_size, max_source_sentence_length,
            source_vocab_to_int, target_vocab_to_int,
            encoding_embedding_size, decoding_embedding_size,
            rnn_size, num_layers)

    # The evaluation model is built with reuse=True so it shares the
    # variables created for the training model above.
    eval_batch_size = 128
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        eval_model = rnn_model.RNN(
            eval_batch_size, max_source_sentence_length,
            source_vocab_to_int, target_vocab_to_int,
            encoding_embedding_size, decoding_embedding_size,
            rnn_size, num_layers, False)

    early_stopping = True

    ### train model ###
    if train_method == 'reinforce_test':
        train.reinforce_test(
            model, experiment.start_checkpoint, source_vocab_to_int,
            learning_rate, keep_probability, graph_batch_size,
            target_int_to_vocab, source_int_to_vocab,
            valid_source, valid_target)
    else:
        train.train(
            experiment.name, train_method, model, epochs, input_batch_size,
            train_source, train_target, valid_source, valid_target,
            learning_rate, keep_probability, save_path,
            experiment.start_checkpoint, target_int_to_vocab,
            source_int_to_vocab, source_vocab_to_int, log_path,
            graph_batch_size, experiment.max_hours, eval_model,
            eval_batch_size, reward_func, early_stopping)