Ejemplo n.º 1
0
def overlap_coeff(arr1, arr2):
    """
    This function computes the overlap coefficient between the two input
    token lists.

    Args:
        arr1,arr2 (list): The input token lists for which the overlap
            coefficient should be computed. A value that is not a list is
            wrapped into a one-element list (i.e. treated as a single token)
            before the missing-value check and the scoring.

    Returns:
        The raw score returned by ``sm.OverlapCoefficient`` if both inputs are
        not None and do not have any missing tokens (as reported by
        ``pd.isnull``), else returns NaN.
    """
    # ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
    # missing-value marker is built as a plain float NaN instead.
    nan = float('nan')

    if arr1 is None or arr2 is None:
        return nan
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return nan
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return nan
    # Create overlap coefficient measure object
    measure = sm.OverlapCoefficient()
    # Call the function to return the overlap coefficient
    return measure.get_raw_score(arr1, arr2)
Ejemplo n.º 2
0
def matchHeaders(headers):
    """
    Print the header pairs of adjacent tables that are considered a match.

    Every header of table ``i`` is compared with every header of table
    ``i + 1``. Headers are split on ``'_'`` for the token-based measures. A
    pair is reported when its overlap-coefficient similarity equals 1, or its
    Levenshtein similarity is at least 0.5, or its Jaccard similarity is at
    least 0.5.

    Args:
        headers: Indexable sequence holding, per table, an iterable of header
            strings.

    Returns:
        None. Each match is written to stdout with 1-based table numbers.
    """
    jac = sm.Jaccard()
    lev = sm.Levenshtein()
    oc = sm.OverlapCoefficient()
    # The tokenizer does not depend on the pair being compared; build it once.
    delim_tok = sm.DelimiterTokenizer(delim_set=['_'])

    # Stopping one short of the last table keeps i + 1 a valid index, so no
    # extra bounds check is needed inside the loop.
    for i in range(len(headers) - 1):
        j = i + 1
        for first in headers[i]:
            for second in headers[j]:
                # Tokenize each header once and share the tokens between the
                # two token-based measures.
                first_tokens = delim_tok.tokenize(first)
                second_tokens = delim_tok.tokenize(second)
                jacScore = jac.get_sim_score(first_tokens, second_tokens)
                levScore = lev.get_sim_score(first, second)
                ocScore = oc.get_sim_score(first_tokens, second_tokens)

                if ocScore == 1 or levScore >= 0.5 or jacScore >= 0.5:
                    print(first + ' of Table' + str(i + 1) + ' and ' + second +
                          ' of Table' + str(j + 1) + ' matched')
Ejemplo n.º 3
0
def get_similarity(s1, s2):
    """
    Return the raw overlap-coefficient score of two texts.

    Both texts are tokenized with ``simple_preprocess`` (``deacc=True``,
    token length limited to 1..25) and the resulting token lists are scored
    with ``sm.OverlapCoefficient``.

    Args:
        s1, s2: The texts to compare (presumably strings, as expected by
            gensim's ``simple_preprocess`` - verify against the file imports).

    Returns:
        The value of ``OverlapCoefficient.get_raw_score`` for the two token
        lists.
    """
    # TODO add lematization/stemming and tokenization using (spaCy|nltk|gensim)
    measure = sm.OverlapCoefficient()
    tokens_a, tokens_b = (
        simple_preprocess(text, deacc=True, min_len=1, max_len=25)
        for text in (s1, s2)
    )
    return measure.get_raw_score(tokens_a, tokens_b)
Ejemplo n.º 4
0
def overlap_coeff(arr1, arr2):
    """
    Compute the overlap coefficient between two token lists.

    Args:
        arr1,arr2 (list): The token lists to compare. A value that is not a
            list is wrapped into a one-element list (treated as a single
            token) before it is checked for missing values and scored.

    Returns:
        The raw score returned by ``sm.OverlapCoefficient``, or NaN when
        either input is None or contains a missing token (as reported by
        ``pd.isnull``).
    """
    # ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0; a
    # plain float NaN is the same value without relying on that alias.
    nan = float('nan')

    if arr1 is None or arr2 is None:
        return nan
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return nan
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return nan
    # Create overlap coefficient measure object
    measure = sm.OverlapCoefficient()
    # Call the function to return the overlap coefficient
    return measure.get_raw_score(arr1, arr2)
Ejemplo n.º 5
0
    def __init__(self):
        """
        Store the similarity measures and the tokenizer on the instance.

        ``similarity_function`` holds one default-constructed instance of each
        listed ``sm`` measure, in the order given below;
        ``alphanumeric_tokenizer`` is an ``sm.AlphanumericTokenizer`` created
        with ``return_set=True``.
        """
        # Measure classes in storage order; each is instantiated without
        # arguments.
        measure_classes = (
            sm.BagDistance,
            sm.Cosine,
            sm.Dice,
            sm.Editex,
            sm.GeneralizedJaccard,
            sm.Jaccard,
            sm.Jaro,
            sm.JaroWinkler,
            sm.Levenshtein,
            sm.OverlapCoefficient,
            sm.TverskyIndex,
        )
        self.similarity_function = [
            measure_class() for measure_class in measure_classes
        ]

        self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
Ejemplo n.º 6
0
def overlap_coeff(arr1, arr2):
    """
    This function computes the overlap coefficient between the two input
    token lists.

    Args:
        arr1,arr2 (list): The input token lists for which the overlap
            coefficient should be computed. A value that is not a list is
            wrapped into a one-element list (i.e. treated as a single token)
            before the missing-value check and the scoring.

    Returns:
        The overlap coefficient if both inputs are not None and do not
        have any missing tokens (i.e NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.overlap_coeff(['data', 'science'], ['data'])
        1.0
        >>> em.overlap_coeff(['data', 'science'], None)
        nan

    """
    # ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
    # missing-value marker is built as a plain float NaN instead.
    nan = float('nan')

    if arr1 is None or arr2 is None:
        return nan
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return nan
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return nan
    # Create overlap coefficient measure object
    measure = sm.OverlapCoefficient()
    # Call the function to return the overlap coefficient
    return measure.get_raw_score(arr1, arr2)
Ejemplo n.º 7
0
# Cosine similarity of the two token columns, computed row by row.
cos = sm.Cosine()
df['Cosine'] = df.apply(
    lambda row: cos.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()

# In[32]:

# Dice similarity of the same token columns.
dice = sm.Dice()
df['Dice'] = df.apply(
    lambda row: dice.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()

# In[33]:

# Overlap-coefficient similarity of the same token columns.
oc = sm.OverlapCoefficient()
df['Overlap'] = df.apply(
    lambda row: oc.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()

# In[34]:

# Tversky index; for alpha and beta see
# https://en.wikipedia.org/wiki/Tversky_index
# With alpha = beta = 0.5 the index is the same as Dice similarity; here the
# two weights are set to 0.3 and 0.6.
tvi = sm.TverskyIndex(0.3, 0.6)
df['Tversky'] = df.apply(
    lambda row: tvi.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()

# In[35]: