def overlap_coeff(arr1, arr2):
    """
    This function computes the overlap coefficient between the two input
    lists/sets.

    Args:
        arr1,arr2 (list or set): The input lists or sets for which the overlap
            coefficient should be computed. Any other non-None value is
            wrapped in a single-element list before it is scored.

    Returns:
        The overlap coefficient if both the lists/sets are not None and do not
        have any missing tokens (i.e. NaN), else returns NaN.
    """
    # `pd.np` was removed in pandas 2.0 and the `np.NaN` alias in NumPy 2.0;
    # a plain float NaN is the same value and works on every version.
    nan = float('nan')

    if arr1 is None or arr2 is None:
        return nan

    # Sets are passed through untouched (wrapping one in a list would produce
    # an unhashable token); they are only copied to a list for the null check
    # because pd.isnull does not look inside a set.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    if any(pd.isnull(list(arr1))):
        return nan

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return nan

    # Create overlap coefficient measure object
    measure = sm.OverlapCoefficient()
    # Call the function to return the overlap coefficient
    return measure.get_raw_score(arr1, arr2)
def matchHeaders(headers):
    """Print the header pairs of adjacent tables that look like a match.

    Each header of table ``i`` is compared with each header of table
    ``i + 1`` only (tables further apart are never compared). A pair is
    reported when any one of these holds:

    * the overlap coefficient of their ``_``-delimited tokens equals 1,
    * their Levenshtein similarity is at least 0.5,
    * the Jaccard similarity of their ``_``-delimited tokens is at least 0.5.

    Args:
        headers: A sequence of tables, each an iterable of header strings.

    Returns:
        None. Matches are written to standard output, with tables numbered
        from 1.
    """
    jac = sm.Jaccard()
    lev = sm.Levenshtein()
    oc = sm.OverlapCoefficient()
    # The tokenizer does not depend on the loop variables, so build it once.
    delim_tok = sm.DelimiterTokenizer(delim_set=['_'])

    for i in range(len(headers) - 1):
        j = i + 1
        for first in headers[i]:
            # Tokenize the left-hand header once for all of its comparisons.
            first_tokens = delim_tok.tokenize(first)
            for second in headers[j]:
                second_tokens = delim_tok.tokenize(second)

                jacScore = jac.get_sim_score(first_tokens, second_tokens)
                levScore = lev.get_sim_score(first, second)
                ocScore = oc.get_sim_score(first_tokens, second_tokens)

                if ocScore == 1 or levScore >= 0.5 or jacScore >= 0.5:
                    print(first + ' of Table' + str(i + 1) + ' and ' +
                          second + ' of Table' + str(j + 1) + ' matched')
def get_similarity(s1, s2):
    """Return the raw overlap-coefficient score of two strings.

    Both strings are turned into token lists with ``simple_preprocess``
    (called with ``deacc=True, min_len=1, max_len=25``), and the two lists
    are then scored with ``sm.OverlapCoefficient``.

    Args:
        s1: First input string.
        s2: Second input string.

    Returns:
        The value of ``OverlapCoefficient.get_raw_score`` for the two token
        lists.
    """
    # TODO add lemmatization/stemming and tokenization using (spaCy|nltk|gensim)
    measure = sm.OverlapCoefficient()
    tokens_a, tokens_b = [
        simple_preprocess(text, deacc=True, min_len=1, max_len=25)
        for text in (s1, s2)
    ]
    return measure.get_raw_score(tokens_a, tokens_b)
def overlap_coeff(arr1, arr2):
    """
    Compute the overlap coefficient between two token collections.

    Args:
        arr1,arr2 (list or set): The token collections to compare. Any other
            non-None value is wrapped in a single-element list before it is
            scored.

    Returns:
        The raw score from ``sm.OverlapCoefficient``, or NaN when either
        input is None or contains a missing token (as reported by
        ``pd.isnull``).
    """
    # `pd.np` was removed in pandas 2.0 and the `np.NaN` alias in NumPy 2.0;
    # a plain float NaN is the same value and works on every version.
    nan = float('nan')

    if arr1 is None or arr2 is None:
        return nan

    # Sets are passed through untouched (wrapping one in a list would produce
    # an unhashable token); they are only copied to a list for the null check
    # because pd.isnull does not look inside a set.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    if any(pd.isnull(list(arr1))):
        return nan

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return nan

    # Create overlap coefficient measure object
    measure = sm.OverlapCoefficient()
    # Call the function to return the overlap coefficient
    return measure.get_raw_score(arr1, arr2)
def __init__(self):
    """Create the similarity measures and the tokenizer kept on the instance.

    ``similarity_function`` holds one default-constructed instance of each
    listed ``sm`` measure, in the order given below.
    ``alphanumeric_tokenizer`` is an ``sm.AlphanumericTokenizer`` built with
    ``return_set=True``.
    """
    measure_classes = (
        sm.BagDistance,
        sm.Cosine,
        sm.Dice,
        sm.Editex,
        sm.GeneralizedJaccard,
        sm.Jaccard,
        sm.Jaro,
        sm.JaroWinkler,
        sm.Levenshtein,
        sm.OverlapCoefficient,
        sm.TverskyIndex,
    )
    self.similarity_function = [make_measure() for make_measure in measure_classes]
    self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
def overlap_coeff(arr1, arr2):
    """
    This function computes the overlap coefficient between the two input
    lists/sets.

    Args:
        arr1,arr2 (list or set): The input lists or sets for which the overlap
            coefficient should be computed. Any other non-None value is
            wrapped in a single-element list before it is scored.

    Returns:
        The overlap coefficient if both the lists/sets are not None and do not
        have any missing tokens (i.e. NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.overlap_coeff(['data', 'science'], ['data'])
        1.0
        >>> em.overlap_coeff(['data', 'science'], None)
        nan
    """
    # `pd.np` was removed in pandas 2.0 and the `np.NaN` alias in NumPy 2.0;
    # a plain float NaN is the same value and works on every version.
    nan = float('nan')

    if arr1 is None or arr2 is None:
        return nan

    # Sets are passed through untouched (wrapping one in a list would produce
    # an unhashable token); they are only copied to a list for the null check
    # because pd.isnull does not look inside a set.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    if any(pd.isnull(list(arr1))):
        return nan

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return nan

    # Create overlap coefficient measure object
    measure = sm.OverlapCoefficient()
    # Call the function to return the overlap coefficient
    return measure.get_raw_score(arr1, arr2)
# Cosine similarity of each row's 'aTokens' and 'bTokens' values.
cos = sm.Cosine()
df['Cosine'] = df.apply(
    lambda row: cos.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()


# In[32]:


# Dice similarity of the same two columns.
dice = sm.Dice()
df['Dice'] = df.apply(
    lambda row: dice.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()


# In[33]:


# Overlap coefficient of the same two columns.
oc = sm.OverlapCoefficient()
df['Overlap'] = df.apply(
    lambda row: oc.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()


# In[34]:


# Tversky index: https://en.wikipedia.org/wiki/Tversky_index
# With alpha and beta both set to 0.5 it is the same as Dice similarity.
# NOTE(review): the two positional arguments are presumably (alpha, beta) -
# confirm against the py_stringmatching signature.
tvi = sm.TverskyIndex(0.3, 0.6)
df['Tversky'] = df.apply(
    lambda row: tvi.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()


# In[35]: