Esempio n. 1
0
def jaro(s1, s2):
    """
    Compute the Jaro measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed. Each one is passed through
            ``gh.convert_to_str_unicode`` before scoring.

    Returns:
        The raw Jaro score returned by ``sm.Jaro().get_raw_score`` if
        neither input is missing (``None`` or a value that ``pd.isnull``
        reports as null), else NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaro('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> em.jaro(None, 'MARTHA')
        nan
    """

    # Missing input -> NaN. ``float('nan')`` replaces ``pd.np.NaN`` because
    # the ``pd.np`` alias was removed in pandas 2.0 (and ``np.NaN`` in
    # NumPy 2.0); the returned value is the same Python float NaN.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.Jaro()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def jaro(s1, s2):
    """
    Compute the Jaro measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed. Values that are neither text
            (``six.string_types``) nor ``bytes`` are converted with
            ``str()`` before scoring.

    Returns:
        The raw Jaro score returned by ``sm.Jaro().get_raw_score`` if
        neither input is missing (``None`` or a value that ``pd.isnull``
        reports as null), else NaN.
    """

    # Missing input -> NaN. ``float('nan')`` replaces ``pd.np.NaN`` because
    # the ``pd.np`` alias was removed in pandas 2.0 (and ``np.NaN`` in
    # NumPy 2.0); the returned value is the same Python float NaN.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.Jaro()

    # Text and bytes are passed through untouched; anything else is
    # stringified so the measure receives a string-like value.
    if not isinstance(s1, (six.string_types, bytes)):
        s1 = str(s1)

    if not isinstance(s2, (six.string_types, bytes)):
        s2 = str(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def textdistance_jaro_winkler_distance(candidates, inp, min_score, winkler):
    """
    Score every candidate against ``inp`` and keep the ones that reach
    ``min_score``.

    Args:
        candidates: Iterable of values to compare with ``inp``.
        inp: The value every candidate is compared against.
        min_score: Inclusive lower bound a score must reach to be kept.
        winkler: When truthy, score with
            ``py_stringmatching.JaroWinkler``; otherwise with
            ``py_stringmatching.Jaro``.

    Returns:
        A list of ``(candidate, score)`` tuples, in the order the
        candidates were supplied, for every candidate whose raw score is
        greater than or equal to ``min_score``.
    """
    if winkler:
        scorer = py_stringmatching.JaroWinkler().get_raw_score
    else:
        scorer = py_stringmatching.Jaro().get_raw_score

    # Lazily pair each candidate with its score, then filter on the score.
    scored_pairs = ((cand, scorer(cand, inp)) for cand in candidates)
    return [pair for pair in scored_pairs if pair[1] >= min_score]
Esempio n. 4
0
def jaro(s1, s2):
    """
    Compute the Jaro measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed. Each one is passed through
            ``helper.convert_to_str_unicode`` before scoring.

    Returns:
        The raw Jaro score returned by ``sm.Jaro().get_raw_score`` if
        neither input is missing (``None`` or a value that ``pd.isnull``
        reports as null), else NaN.
    """
    # Missing input -> NaN. ``float('nan')`` replaces ``pd.np.NaN`` because
    # the ``pd.np`` alias was removed in pandas 2.0 (and ``np.NaN`` in
    # NumPy 2.0); the returned value is the same Python float NaN.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    s1 = helper.convert_to_str_unicode(s1)
    s2 = helper.convert_to_str_unicode(s2)

    measure = sm.Jaro()
    return measure.get_raw_score(s1, s2)
 def extract_jaro_distance(queried_name, predicted_name):
     """
     Compute the raw Jaro score for each aligned pair of names.

     Args:
         queried_name: Indexable collection of names; its length fixes the
             number of pairs, and element ``i`` is read as
             ``queried_name[i]``.
         predicted_name: Indexable collection read at the same positions
             as ``queried_name``.

     Returns:
         A float NumPy array with one entry per pair. Entry ``i`` holds
         ``sm.Jaro().get_raw_score(queried_name[i], predicted_name[i])``,
         or NaN when scoring that pair raised an exception.
     """
     jaro_measure = sm.Jaro()
     n_pairs = len(queried_name)
     # Start from NaN rather than np.empty so that a pair which fails to
     # score is reported as missing instead of as uninitialised memory.
     res = np.full(n_pairs, np.nan, dtype=float)
     for i in tqdm(range(n_pairs)):
         try:
             res[i] = jaro_measure.get_raw_score(queried_name[i], predicted_name[i])
         except Exception:
             # Best effort: report the failing position and keep going.
             # Catching Exception (not a bare except) lets
             # KeyboardInterrupt / SystemExit still stop the loop.
             print(i)
     return res
Esempio n. 6
0
    def __init__(self):
        """
        Set up the similarity measures and the tokenizer held by this object.

        ``self.similarity_function`` becomes a list with one default-constructed
        instance of each listed ``sm`` measure class, in the listed order.
        ``self.alphanumeric_tokenizer`` becomes an ``sm.AlphanumericTokenizer``
        created with ``return_set=True``.
        """
        measure_classes = (
            sm.BagDistance,
            sm.Cosine,
            sm.Dice,
            sm.Editex,
            sm.GeneralizedJaccard,
            sm.Jaccard,
            sm.Jaro,
            sm.JaroWinkler,
            sm.Levenshtein,
            sm.OverlapCoefficient,
            sm.TverskyIndex,
        )
        # Instantiate every measure with its default arguments, keeping order.
        self.similarity_function = [measure_cls() for measure_cls in measure_classes]

        self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
Esempio n. 7
0
def jaro(s1, s2):
    """
    Compute the Jaro measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed. Values that are neither text
            (``six.string_types``) nor ``bytes`` are converted with
            ``str()`` before scoring.

    Returns:
        The raw Jaro score returned by ``sm.Jaro().get_raw_score`` if
        neither input is missing (``None`` or a value that ``pd.isnull``
        reports as null), else NaN.
    """
    # Missing input -> NaN. ``float('nan')`` replaces ``pd.np.NaN`` because
    # the ``pd.np`` alias was removed in pandas 2.0 (and ``np.NaN`` in
    # NumPy 2.0); the returned value is the same Python float NaN.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.Jaro()

    # Text and bytes are passed through untouched; anything else is
    # stringified so the measure receives a string-like value.
    if not isinstance(s1, (six.string_types, bytes)):
        s1 = str(s1)

    if not isinstance(s2, (six.string_types, bytes)):
        s2 = str(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
Esempio n. 8
0
# Notebook export: install py_stringmatching through the IPython shell hook
# before importing it.
get_ipython().system('pip install py_stringmatching')
import py_stringmatching as sm

# # Token Based Similarities

# In[27]:

# Row-wise Jaccard similarity between the 'aTokens' and 'bTokens' columns,
# stored in a new 'Jaccard' column.
# NOTE(review): `df` is not created in this snippet - presumably built in an
# earlier notebook cell; confirm its columns hold token collections.
jac = sm.Jaccard()
df['Jaccard'] = df.apply(
    lambda x: jac.get_sim_score(x['aTokens'], x['bTokens']), axis=1)
# Preview the first rows (notebook cell output).
df.head()

# In[28]:

# Module-level Jaro measure; jaccard_similarity_general below calls
# jaro.get_sim_score on pairs of tokens.
jaro = sm.Jaro()

# !pip install pyjarowinkler
# from pyjarowinkler import distance
# def jaro_similarity(word1, word2):
#   return distance.get_jaro_distance(word1, word2, winkler=False, scaling=0.1)


def jaccard_similarity_general(tokens1, tokens2):
    intersection = []
    for token1 in list(set(tokens1)):
        for token2 in list(set(tokens2)):
            if jaro.get_sim_score(token1, token2) > 0.7:
                if token1 not in intersection:
                    intersection.append(token1)
                if token2 not in intersection:
Esempio n. 9
0
    # iteration #2: trim whitespaces from artist and track labels
    row[3] = row[3].strip()
    row[4] = row[4].strip()
    row[7] = row[7].strip()
    row[8] = row[8].strip()
    sampledList.append(row)
f.close()

# Converting every row into a feature vector
# featList and label start out empty.
# NOTE(review): presumably they are filled once per row further down in the
# loop over sampledList; that part is not visible here - confirm.
featList = []
label = []
# Tokenizer used below to split item[3] and item[7] into tokens before
# comparing them pairwise.
ws = ps.WhitespaceTokenizer()
for item in sampledList:
    fi = []

    jaro1 = ps.Jaro()

    # iteration #3:
    # pull the feature value to zero if none of the token pairs from either artist strings have a high
    # enough similarity score
    f1 = 0
    for t1 in ws.tokenize(item[3]):
        if max([jaro1.get_raw_score(t1, t2) for t2 in ws.tokenize(item[7])]) > .75:
            f1 = jaro1.get_raw_score(item[3], item[7])
            break

    # iteration #3:
    # if the artist doesn't match scale down the track similarity by a factor of 3
    # and if the track score isn't high enough pull it down to 0
    jaro2 = ps.Jaro()
    f2 = jaro1.get_raw_score(item[4], item[8])