def __init__(self):
    """Create the similarity measures and the tokenizer held by this object.

    Sets two attributes:

    * ``similarity_function`` -- a list with one freshly constructed
      instance of each measure class listed below, in that order.
    * ``alphanumeric_tokenizer`` -- an ``sm.AlphanumericTokenizer`` built
      with ``return_set=True``.
    """
    # Order matters: the list keeps exactly this sequence of measures.
    measure_types = (
        sm.BagDistance,
        sm.Cosine,
        sm.Dice,
        sm.Editex,
        sm.GeneralizedJaccard,
        sm.Jaccard,
        sm.Jaro,
        sm.JaroWinkler,
        sm.Levenshtein,
        sm.OverlapCoefficient,
        sm.TverskyIndex,
    )
    # Every measure is constructed with its default arguments.
    self.similarity_function = [measure() for measure in measure_types]
    self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
# Add a 'TfIdf' column: for every row, score the 'aTokens' value against the
# 'bTokens' value with tfidf.get_sim_score. `tfidf` is expected to have been
# created earlier in the script.
df['TfIdf'] = df.apply(
    lambda x: tfidf.get_sim_score(x['aTokens'], x['bTokens']), axis=1)
df.head()


# # Sequence Based Similarities
# The measures below read the 'Sequence1' / 'Sequence2' columns instead of the
# token columns used above (presumably the untokenized strings -- TODO confirm).

# In[38]:

# NOTE(review): Affine is the only measure here scored with get_raw_score;
# the others use get_sim_score. Presumably its values are not on the same
# scale as the other columns -- confirm against the library documentation.
aff = sm.Affine()
df['Affine'] = df.apply(
    lambda x: aff.get_raw_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()


# In[39]:

# Add a 'Bag' column with the row-wise BagDistance similarity score.
bd = sm.BagDistance()
df['Bag'] = df.apply(
    lambda x: bd.get_sim_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()


# In[40]:

# Add an 'Editex' column with the row-wise Editex similarity score.
ed = sm.Editex()
df['Editex'] = df.apply(
    lambda x: ed.get_sim_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()


# In[41]:

# Add a 'Jaro' column using a Jaro measure instance.
jaro = sm.Jaro()
df['Jaro'] = df.apply(