# Module-level imports assumed for the functions below. The import path for
# the helper module (gh) follows py_entitymatching's layout and may differ
# slightly in your checkout; numpy is imported directly because the old
# pd.np alias has been removed in recent pandas releases.
import numpy as np
import pandas as pd
import py_stringmatching as sm

import py_entitymatching.utils.generic_helper as gh


def smith_waterman(s1, s2):
    """
    This function computes the Smith-Waterman measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Smith-Waterman measure if both the strings are not missing
        (i.e., NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.smith_waterman('cat', 'hat')
        2.0
        >>> em.smith_waterman('cat', None)
        nan
    """
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    # Create the similarity measure object
    measure = sm.SmithWaterman()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)

def jaro_winkler(s1, s2):
    """
    This function computes the Jaro-Winkler measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Jaro-Winkler measure if both the strings are not missing
        (i.e., NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaro_winkler('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> em.jaro_winkler('MARTHA', None)
        nan
    """
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    # Create the similarity measure object
    measure = sm.JaroWinkler()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)

def lev_sim(s1, s2):
    """
    This function computes the Levenshtein similarity between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Levenshtein similarity if both the strings are not missing
        (i.e., NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.lev_sim('alex', 'alxe')
        0.5
        >>> em.lev_sim(None, 'alex')
        nan
    """
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    # Create the similarity measure object
    measure = sm.Levenshtein()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the normalized similarity score
    return measure.get_sim_score(s1, s2)

def tok_alphanumeric(input_string):
    """
    This function returns a list of tokens that are maximal sequences of
    consecutive alphanumeric characters.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_alphanumeric('data9,(science), data9#.(integration).88')
        ['data9', 'science', 'data9', 'integration', '88']
        >>> em.tok_alphanumeric('#.$')
        []
        >>> em.tok_alphanumeric(None)
        nan
    """
    if pd.isnull(input_string):
        return np.nan

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.AlphanumericTokenizer()
    return measure.tokenize(input_string)

def tok_wspace(input_string):
    """
    This function splits the input string into a list of tokens
    (based on whitespace).

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_wspace('data science')
        ['data', 'science']
        >>> em.tok_wspace(None)
        nan
    """
    if pd.isnull(input_string):
        return np.nan

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.WhitespaceTokenizer()
    return measure.tokenize(input_string)

def tok_delim(input_string, d):
    """
    This function splits the input string into a list of tokens
    (based on the delimiter).

    Args:
        input_string (string): Input string that should be tokenized.
        d (string): Delimiter string.

    Returns:
        A list of tokens, if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_delim('data science', ' ')
        ['data', 'science']
        >>> em.tok_delim('data$#$science', '$#$')
        ['data', 'science']
        >>> em.tok_delim(None, ' ')
        nan
    """
    if pd.isnull(input_string):
        return np.nan

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.DelimiterTokenizer(delim_set=[d])
    return measure.tokenize(input_string)

def tok_qgram(input_string, q):
    """
    This function splits the input string into a list of q-grams. Note that
    by default the input string is padded and then tokenized.

    Args:
        input_string (string): Input string that should be tokenized.
        q (int): q-value that should be used to tokenize the input string.

    Returns:
        A list of tokens, if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_qgram('database', q=2)
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
        >>> em.tok_qgram('database', q=3)
        ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
        >>> em.tok_qgram(None, q=2)
        nan
    """
    if pd.isnull(input_string):
        return np.nan

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.QgramTokenizer(qval=q)
    return measure.tokenize(input_string)

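# A minimal sketch of what non-padded q-gram tokenization would look like,
# assuming py_stringmatching's QgramTokenizer accepts a padding flag (an
# assumption about that library's constructor); with padding disabled, no
# '#'/'$' pad characters appear in the tokens:
#
# >>> sm.QgramTokenizer(qval=2, padding=False).tokenize('database')
# ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
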
def tok_qgram(s):
    # Closure variant used internally: q is expected to be bound in the
    # enclosing scope (for example, by a factory function) rather than
    # passed as an argument.
    # check if the input is of type base string
    if pd.isnull(s):
        return s

    s = gh.convert_to_str_unicode(s)

    measure = sm.QgramTokenizer(qval=q)
    return measure.tokenize(s)

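# A minimal sketch of how the closure above can be made self-contained by a
# factory that binds q before the tokenizer is used; the factory name
# _make_tok_qgram is an assumption, not necessarily the name used in
# py_entitymatching.
def _make_tok_qgram(q):
    def tok_qgram(s):
        if pd.isnull(s):
            return s
        s = gh.convert_to_str_unicode(s)
        return sm.QgramTokenizer(qval=q).tokenize(s)
    return tok_qgram

# Example:
# >>> tok_2gram = _make_tok_qgram(2)
# >>> tok_2gram('database')
# ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
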
def affine(s1, s2):
    """
    This function computes the affine measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The affine measure if both the strings are not missing
        (i.e., NaN or None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.affine('dva', 'deeva')
        1.5
        >>> em.affine(None, 'deeva')
        nan
    """
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    # Create the similarity measure object
    measure = sm.Affine()

    # Convert the inputs to unicode strings (this replaces the older
    # six-based conversion logic).
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity
    return measure.get_raw_score(s1, s2)

def tok_delim(s):
    # Closure variant used internally: the delimiter d is expected to be
    # bound in the enclosing scope (for example, by a factory function)
    # rather than passed as an argument.
    # check if the input is of type base string
    if pd.isnull(s):
        return s

    s = gh.convert_to_str_unicode(s)

    # Initialize the tokenizer measure object
    measure = sm.DelimiterTokenizer(delim_set=[d])

    # Call the function that will tokenize the input string.
    return measure.tokenize(s)

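# A minimal sketch of how the closure above can be made self-contained by a
# factory that binds the delimiter d; the factory name _make_tok_delim is an
# assumption, not necessarily the name used in py_entitymatching.
def _make_tok_delim(d):
    def tok_delim(s):
        if pd.isnull(s):
            return s
        s = gh.convert_to_str_unicode(s)
        return sm.DelimiterTokenizer(delim_set=[d]).tokenize(s)
    return tok_delim

# Example:
# >>> tok_dollar = _make_tok_delim('$#$')
# >>> tok_dollar('data$#$science')
# ['data', 'science']
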
def hamming_dist(s1, s2):
    """
    This function computes the Hamming distance between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Hamming distance if both the strings are not missing
        (i.e., NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.hamming_dist('alex', 'john')
        4
        >>> em.hamming_dist(None, 'john')
        nan
    """
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    # Create the similarity measure object
    measure = sm.HammingDistance()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the distance
    return measure.get_raw_score(s1, s2)

def needleman_wunsch(s1, s2):
    """
    This function computes the Needleman-Wunsch measure between the two
    input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Needleman-Wunsch measure if both the strings are not missing
        (i.e., NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.needleman_wunsch('dva', 'deeva')
        1.0
        >>> em.needleman_wunsch('dva', None)
        nan
    """
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    # Create the similarity measure object
    measure = sm.NeedlemanWunsch()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)

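# A minimal usage sketch (not part of the library): applying the scalar
# similarity functions above row-wise over a pandas DataFrame of string
# pairs. The column names 'name_a' and 'name_b' are hypothetical.
if __name__ == '__main__':
    pairs = pd.DataFrame({'name_a': ['cat', 'MARTHA', None],
                          'name_b': ['hat', 'MARHTA', 'alex']})
    # Missing values propagate as NaN because each function checks pd.isnull.
    pairs['sw'] = pairs.apply(
        lambda r: smith_waterman(r['name_a'], r['name_b']), axis=1)
    pairs['jw'] = pairs.apply(
        lambda r: jaro_winkler(r['name_a'], r['name_b']), axis=1)
    print(pairs)
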