Exemple #1
0
def syntactic_sim(col1: Column, col2: Column) -> float:
    """Compare two columns through the names of their basic tokens.

    The values of each column are concatenated into one string using
    ``"||"`` as separator. That string is handed to
    ``Token.get_basic_tokens`` and the ``name`` attribute of every returned
    token is collected, in order. The two resulting lists of names are passed
    to ``jaccard`` and its result is returned as-is.

    Args:
        col1: First column; its ``values`` must be joinable strings.
        col2: Second column; same requirement as ``col1``.

    Returns:
        Whatever ``jaccard`` yields for the two token-name lists
        (presumably a Jaccard similarity score -- confirm against ``jaccard``).
    """
    separator = "||"
    joined_first = separator.join(col1.values)
    joined_second = separator.join(col2.values)

    names_first = []
    for token in Token.get_basic_tokens(joined_first):
        names_first.append(token.name)

    names_second = []
    for token in Token.get_basic_tokens(joined_second):
        names_second.append(token.name)

    return jaccard(names_first, names_second)
Exemple #2
0
def length_syntactic_sim(col1: Column, col2: Column) -> float:
    """Compare two columns through per-value pattern signatures.

    Every value of a column is turned into a signature string: the value is
    passed to ``Token.get_basic_pattern`` and each returned item is rendered
    as ``"<token_type.name>(<length>)"``; the rendered items are joined with
    a single space. One signature is produced per value, so each column
    yields a list as long as its ``values``. The two lists are passed to
    ``jaccard`` and its result is returned unchanged.

    Args:
        col1: First column whose ``values`` are turned into signatures.
        col2: Second column, processed the same way after ``col1``.

    Returns:
        Whatever ``jaccard`` yields for the two signature lists
        (presumably a Jaccard similarity score -- confirm against ``jaccard``).
    """

    def signature(raw_value):
        # One "TYPE(length)" fragment per pattern item, space-separated.
        fragments = [
            "%s(%s)" % (item.token_type.name, item.length)
            for item in Token.get_basic_pattern(raw_value)
        ]
        return " ".join(fragments)

    signatures_first = [signature(raw_value) for raw_value in col1.values]
    signatures_second = [signature(raw_value) for raw_value in col2.values]

    return jaccard(signatures_first, signatures_second)
Exemple #3
0
def token_jaccard(col1: Column, col2: Column) -> float:
    """Compare two columns through their whitespace-separated tokens.

    Each value of a column is split with ``split()`` (no argument) and the
    pieces from all values are gathered, in order, into one flat list per
    column. Both lists are handed to ``jaccard`` and its result is returned.

    Args:
        col1: First column; each entry of ``values`` must provide ``split()``.
        col2: Second column; same requirement as ``col1``.

    Returns:
        Whatever ``jaccard`` yields for the two flat token lists
        (presumably a Jaccard similarity score -- confirm against ``jaccard``).
    """
    words_first = []
    for entry in col1.values:
        words_first.extend(entry.split())

    words_second = []
    for entry in col2.values:
        words_second.extend(entry.split())

    return jaccard(words_first, words_second)
Exemple #4
0
def values_jaccard(col1: Column, col2: Column) -> float:
    """Compare two columns directly on their raw ``values``.

    No preprocessing is applied: the ``values`` attribute of each column is
    forwarded to ``jaccard`` untouched and the result is returned.

    Args:
        col1: First column.
        col2: Second column.

    Returns:
        Whatever ``jaccard`` yields for the two ``values`` collections
        (presumably a Jaccard similarity score -- confirm against ``jaccard``).
    """
    first_values = col1.values
    second_values = col2.values
    return jaccard(first_values, second_values)
Exemple #5
0
def ngram_jaccard(col1: Column, col2: Column, n: int) -> float:
    """Compare two columns through the n-grams of their values.

    ``create_ngrams(value, n)`` is called for every value of a column and
    everything it produces is gathered, in order, into one flat list per
    column. Both lists are handed to ``jaccard`` and its result is returned.

    Args:
        col1: First column.
        col2: Second column.
        n: Passed through to ``create_ngrams`` as its second argument
            (presumably the n-gram size -- confirm against ``create_ngrams``).

    Returns:
        Whatever ``jaccard`` yields for the two flat n-gram lists
        (presumably a Jaccard similarity score -- confirm against ``jaccard``).
    """
    grams_first = []
    for entry in col1.values:
        grams_first.extend(create_ngrams(entry, n))

    grams_second = []
    for entry in col2.values:
        grams_second.extend(create_ngrams(entry, n))

    return jaccard(grams_first, grams_second)
Exemple #6
0
def text_jaccard(col1: Column, col2: Column) -> float:
    """Compare two columns through the words of their ``text()`` output.

    The result of each column's ``text()`` is split on the single-space
    separator ``" "`` and wrapped in a NumPy array. Both arrays are handed to
    ``jaccard`` and its result is returned.

    Args:
        col1: First column; ``text()`` must return something with ``split``.
        col2: Second column; same requirement as ``col1``.

    Returns:
        Whatever ``jaccard`` yields for the two word arrays
        (presumably a Jaccard similarity score -- confirm against ``jaccard``).
    """
    # NOTE(review): assuming text() returns a str, split(" ") keeps empty
    # strings for consecutive spaces, unlike the argument-less split().
    words_first = col1.text().split(" ")
    array_first = np.array(words_first)

    words_second = col2.text().split(" ")
    array_second = np.array(words_second)

    return jaccard(array_first, array_second)
Exemple #7
0
def name_jaccard(col1: Column, col2: Column) -> float:
    """Compare two columns through the 2-grams of their names.

    ``create_ngrams`` is called on each column's ``name`` with ``2`` as the
    second argument, and each result is wrapped in a NumPy array. Both arrays
    are handed to ``jaccard`` and its result is returned.

    Args:
        col1: First column; only its ``name`` is used.
        col2: Second column; only its ``name`` is used.

    Returns:
        Whatever ``jaccard`` yields for the two arrays
        (presumably a Jaccard similarity score -- confirm against ``jaccard``).
    """
    gram_size = 2

    grams_first = create_ngrams(col1.name, gram_size)
    array_first = np.array(grams_first)

    grams_second = create_ngrams(col2.name, gram_size)
    array_second = np.array(grams_second)

    return jaccard(array_first, array_second)