def needleman_wunsch(s1, s2):
    """
    This function computes the Needleman-Wunsch measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed. Values that are neither text nor bytes are
            converted with ``str`` before scoring.

    Returns:
        The Needleman-Wunsch raw score if both the strings are not missing
        (i.e NaN), else returns NaN.
    """
    # `pd.np` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
    # missing-value result is built without going through that alias.
    missing = float('nan')

    if s1 is None or s2 is None:
        return missing
    if pd.isnull(s1) or pd.isnull(s2):
        return missing

    # Text and bytes are handed to the measure untouched; anything else
    # (numbers, for instance) is stringified first.
    passthrough_types = (six.string_types, bytes)
    if not isinstance(s1, passthrough_types):
        s1 = str(s1)
    if not isinstance(s2, passthrough_types):
        s2 = str(s2)

    # Create the similarity measure object and compute the raw score
    measure = sm.NeedlemanWunsch()
    return measure.get_raw_score(s1, s2)
def needleman_wunsch(s1, s2):
    """
    Compute the Needleman-Wunsch raw score between two input values.

    Args:
        s1,s2: The values to compare. Both are passed through
            ``helper.convert_to_str_unicode`` before scoring (presumably a
            text normalisation step -- confirm against the helper).

    Returns:
        The raw score reported by ``sm.NeedlemanWunsch`` if neither input is
        missing (``None`` or null according to ``pd.isnull``), else NaN.
    """
    # `pd.np` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
    # missing-value result is built without going through that alias.
    missing = float('nan')

    if s1 is None or s2 is None:
        return missing
    if pd.isnull(s1) or pd.isnull(s2):
        return missing

    s1 = helper.convert_to_str_unicode(s1)
    s2 = helper.convert_to_str_unicode(s2)

    measure = sm.NeedlemanWunsch()
    return measure.get_raw_score(s1, s2)
def needleman_wunsch(s1, s2):
    """
    Score two input values with ``nw_norm``, treating missing inputs as NaN.

    Args:
        s1,s2: The values to compare. Values that are neither text nor bytes
            are converted with ``str`` before scoring.

    Returns:
        The result of ``nw_norm(s1, s2)`` if neither input is missing
        (``None`` or null according to ``pd.isnull``), else NaN.
        NOTE(review): ``nw_norm`` is defined outside this block; presumably a
        normalised Needleman-Wunsch score -- confirm.
    """
    # `pd.np` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
    # missing-value result is built without going through that alias.
    missing = float('nan')

    if s1 is None or s2 is None:
        return missing
    if pd.isnull(s1) or pd.isnull(s2):
        return missing

    # Text and bytes are scored as they are; anything else is stringified.
    passthrough_types = (six.string_types, bytes)
    if not isinstance(s1, passthrough_types):
        s1 = str(s1)
    if not isinstance(s2, passthrough_types):
        s2 = str(s2)

    # The score comes from nw_norm, so no sm.NeedlemanWunsch object is built
    # here (the previous one was created and never used).
    return nw_norm(s1, s2)
def needleman_wunsch(s1, s2):
    """
    This function computes the Needleman-Wunsch measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Needleman-Wunsch measure if both the strings are not missing
        (i.e NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.needleman_wunsch('dva', 'deeva')
        1.0
        >>> em.needleman_wunsch('dva', None)
        nan
    """
    # `pd.np` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
    # missing-value result is built without going through that alias.
    missing = float('nan')

    if s1 is None or s2 is None:
        return missing
    if pd.isnull(s1) or pd.isnull(s2):
        return missing

    # Create the similarity measure object
    measure = sm.NeedlemanWunsch()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def _score_sequence_pairs(score):
    """Apply ``score`` to the Sequence1/Sequence2 values of every row of ``df``."""
    return df.apply(
        lambda row: score(row['Sequence1'], row['Sequence2']), axis=1)


# Jaro similarity score for each sequence pair.
jaro = sm.Jaro()
df['Jaro'] = _score_sequence_pairs(jaro.get_sim_score)
df.head()


# In[42]:


# Levenshtein similarity score for each sequence pair.
lev = sm.Levenshtein()
df['Levenshtein'] = _score_sequence_pairs(lev.get_sim_score)
df.head()


# In[43]:


# Needleman-Wunsch raw score for each sequence pair.
nw = sm.NeedlemanWunsch()
df['NeedlemanWunsch'] = _score_sequence_pairs(nw.get_raw_score)
df.head()


# In[44]:


# Smith-Waterman raw score for each sequence pair.
sw = sm.SmithWaterman()
df['SmithWaterman'] = _score_sequence_pairs(sw.get_raw_score)
df.head()


# ### The following data is used to quickly establish the performance of models (NOT THE FINAL CODE)

# In[45]:
True) jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(), sm.WhitespaceTokenizer(return_set=True)) jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True)) dice = FVC.stringMatchExcerpts('Dice', sm.Dice(), sm.WhitespaceTokenizer(return_set=True)) diceq3 = FVC.stringMatchExcerpts('Dice', sm.Dice(), sm.QgramTokenizer(qval=3, return_set=True)) cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True)) cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True)) LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein()) sw = FVC.stringMatchTitles('SW', sm.SmithWaterman()) nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch()) jw = FVC.stringMatchTitles('JW', sm.JaroWinkler()) def writeToCSV(fileName, header, tableList): wr = csv.writer(open(fileName, 'wb'), quoting=csv.QUOTE_ALL) wr.writerow(header) for row in tableList: wr.writerow(row) # Given a set of feature vector components, records precision and recall over several # classifiers. Records output to a table and vertical bar plot. def modelExperiment(insampleData,