def chapman_orderedname_compound_sim(src, dest, func1=soundex_sim, func2=smith_waterman_sim, qgraml=None): #soundex and smith waterman are used in java implementation # we have extended our suppor to change the functions. if src == dest: return float(1) if not src or not dest: #if one of them is null then it return 0 return float(0) if isinstance(src, list) and isinstance(dest, list): q_src = src q_dest = dest else: q_src = QGram(src, qgraml) q_dest = QGram(dest, qgraml) q_src_mag = len(q_src) q_dest_mag = len(q_dest) minTokens = min(q_src_mag, q_dest_mag) SKEW_AMOUNT = float(1) sumMatches = float(0) for i in xrange(1, minTokens + 1, 1): strWeightingAdjustment = ((float(1) / minTokens) + ( ((((minTokens - i) + float(0.5)) - (minTokens / float(2))) / minTokens) * SKEW_AMOUNT * (float(1) / minTokens))) sToken = q_src[q_src_mag - i] tToken = q_dest[q_dest_mag - i] found1 = func1(sToken, tToken) found2 = func2(sToken, tToken) sumMatches += ((float(0.5) * (found1 + found2)) * strWeightingAdjustment) return sumMatches '''
def matching_coefficient(src, dest, qgraml=None):
    """
    Matching coefficient similarity

    For two token collections X and Y (duplicates ignored), the Matching
    coefficient similarity is:
        |X & Y| / max(|X|, |Y|)

    :param str src, dest: two strings to be compared (or two lists of
        tokens/q-grams, which are used as given).
        Equal inputs score 1; this check runs first, so it also covers two
        empty or two None inputs. Otherwise an empty or None input scores 0.
    :param int qgraml: the length of each q-gram; None for non-q-gram.
        Handed to QGram when the inputs are not both lists.
        The default qgraml if not given is None.
    :returns: Matching coefficient similarity
    :rtype: float
    """
    # Guard clauses: identical inputs first, then missing/empty inputs.
    if src == dest:
        return float(1)
    if not (src and dest):
        return float(0)

    # Only tokenize when the caller did not already supply two token lists.
    already_tokenized = isinstance(src, list) and isinstance(dest, list)
    src_tokens = src if already_tokenized else QGram(src, qgraml)
    dest_tokens = dest if already_tokenized else QGram(dest, qgraml)

    src_unique = set(src_tokens)
    dest_unique = set(dest_tokens)
    shared = len(src_unique & dest_unique)
    largest = max(len(src_unique), len(dest_unique))
    return shared / largest
def qgram_distance(src, dest, qgraml=3):
    """
    QGram Distance similarity

    For two token collections X and Y (duplicates ignored), the QGram
    Distance similarity is:
        (|X| + |Y| - |X ^ Y|) / (|X| + |Y|)
    where X ^ Y is the set of tokens present in exactly one of the two.

    :param str src, dest: two strings to be compared (or two lists of
        tokens/q-grams, which are used as given).
        Equal inputs score 1.0; this check runs first, so it also covers two
        empty or two None inputs. Otherwise an empty or None input scores 0.0.
    :param int qgraml: the length of each q-gram; None for non-q-gram.
        Handed to QGram when the inputs are not both lists.
        The default qgraml if not given is '3'.
    :returns: QGram Distance similarity
    :rtype: float

    Examples (values as recorded in the original documentation):

    >>> qgram_distance('Test String1','Test String2',qgraml=None)
    0.5
    >>> qgram_distance('Test String1','Test String2')
    0.7857142857142857
    """
    if src == dest:
        return float(1.0)
    if not src or not dest:
        # if one of them is null then it returns 0
        return float(0.0)

    if isinstance(src, list) and isinstance(dest, list):
        q_src = src
        q_dest = dest
    else:
        q_src = QGram(src, qgraml)
        q_dest = QGram(dest, qgraml)

    # NOTE(review): tokens are de-duplicated before counting, so repeated
    # q-grams carry no extra weight -- confirm that multiset counts were not
    # intended.
    q_src_set = set(q_src)
    q_dest_set = set(q_dest)

    total_qgrams = len(q_src_set) + len(q_dest_set)
    if not total_qgrams:
        # Tokenization produced nothing on either side; avoid dividing by 0.
        return float(0.0)

    # Each token occurs at most once per set, so the per-token count
    # difference is 1 exactly for tokens found on one side only.
    difference = len(q_src_set ^ q_dest_set)

    # float() keeps this a true division even without Python 3 semantics.
    return (total_qgrams - difference) / float(total_qgrams)
def eucledian_dist_sim(src, dest, qgraml=None):
    """
    Eucledian similarity

    For two token collections X and Y (duplicates ignored), the Eucledian
    similarity is:
        (P - D) / P
    with P = sqrt(|X|^2 + |Y|^2) and D = sqrt(|X ^ Y|), where X ^ Y is the
    set of tokens present in exactly one of the two.

    :param str src, dest: two strings to be compared (or two lists of
        tokens/q-grams, which are used as given).
        Equal inputs score 1; this check runs first, so it also covers two
        empty or two None inputs. Otherwise an empty or None input scores 0.
    :param int qgraml: the length of each q-gram; None for non-q-gram.
        Handed to QGram when the inputs are not both lists.
        The default qgraml if not given is None.
    :returns: Eucledian similarity
    :rtype: float

    Examples (values as recorded in the original documentation):

    >>> eucledian_dist_sim('I am a good boy','am I a good boy',qgraml=None)
    1
    >>> eucledian_dist_sim('Test String1','Test String2',qgraml=3)
    0.8762820851736516
    """
    if src == dest:
        return float(1)
    if not src or not dest:
        # if one of them is null then it returns 0
        return float(0)

    if isinstance(src, list) and isinstance(dest, list):
        q_src = src
        q_dest = dest
    else:
        q_src = QGram(src, qgraml)
        q_dest = QGram(dest, qgraml)

    # NOTE(review): tokens are de-duplicated before counting, so repeated
    # q-grams carry no extra weight -- confirm that multiset counts were not
    # intended.
    q_src_set = set(q_src)
    q_dest_set = set(q_dest)
    q_src_mag = len(q_src_set)
    q_dest_mag = len(q_dest_set)

    total_possible = math.sqrt(q_src_mag * q_src_mag + q_dest_mag * q_dest_mag)
    if not total_possible:
        # Tokenization produced nothing on either side; avoid dividing by 0.
        return float(0)

    # Per-token counts are 0 or 1 in a set, so the sum of squared count
    # differences equals the number of tokens found on one side only.
    total_distance = math.sqrt(len(q_src_set ^ q_dest_set))

    return (total_possible - total_distance) / total_possible
def monge_elkan_sim(src, dest, qgraml=None, func=None, symmetric=False):
    """
    Monge Elkan Similarity

    For the given 'src' and 'dest', every src token is scored against every
    dest token with 'func'; the best score per src token is kept and the
    mean of those best scores is returned.

    :param str src : first string to be compared to (or a list of tokens)
    :param str dest : second string to compare with (or a list of tokens)
        Equal inputs score 1; this check runs first, so it also covers two
        empty or two None inputs. Otherwise an empty or None input scores 0.
        Two lists are used as given; anything else is tokenized with QGram.
    :param int qgraml: the length of each q-gram; None for non-q-gram.
        Also forwarded as the third argument of 'func'.
        The default qgraml if not given is None.
    :param function func( optional ) : called as func(src_token, dest_token,
        qgraml) and expected to return the normalized similarity of the two
        tokens. When omitted (None) sim_cosine is used; it is looked up at
        call time.
    :param boolean symmetric : this should specify 'True' or 'False'.
        If True the similarity is also computed with src and dest
        interchanged and the mean of both directions is returned.
    :returns: mean of the best per-token similarities as produced by 'func'
        ( Cosine Similarity by default. )
    :rtype: float

    Example (value as recorded in the original documentation):

    >>> monge_elkan_sim('Test String1','Test String2',qgraml=None,func=sim_cosine,symmetric=False)
    0.5
    """
    if src == dest:
        return float(1)
    if not dest or not src:
        return float(0)

    if isinstance(src, list) and isinstance(dest, list):
        q_src = src
        q_dest = dest
    else:
        q_src = QGram(src, qgraml)
        q_dest = QGram(dest, qgraml)

    if func is None:
        # Resolved here rather than in the signature: a default argument is
        # evaluated when the def statement runs, and sim_cosine is defined
        # further down in this module.
        func = sim_cosine

    token_count = len(q_src)
    if not token_count:
        # Tokenization produced no src tokens; avoid dividing by 0.
        sim_res = float(0)
    else:
        sum_matches = float(0)
        for stoken in q_src:
            # Best match for this src token; scores below 0 are floored at 0.
            max_found = float(0)
            for dtoken in q_dest:
                found = func(stoken, dtoken, qgraml)
                if found > max_found:
                    max_found = found
            sum_matches += max_found
        sim_res = sum_matches / token_count

    if symmetric:
        # Score the opposite direction (dest against src) and average both.
        reverse_res = monge_elkan_sim(dest, src, qgraml, func)
        sim_res = (sim_res + reverse_res) / 2
    return sim_res
def dice_sim(src, dest, qgraml=None):
    """
    Dice similarity

    For two token collections X and Y (duplicates ignored), the Dice
    similarity is:
        2 * |X & Y| / (|X| + |Y|)

    :param str src, dest: two strings to be compared (or two lists of
        tokens/q-grams, which are used as given).
        Equal inputs score 1; this check runs first, so it also covers two
        empty or two None inputs. Otherwise an empty or None input scores 0.
    :param int qgraml: the length of each q-gram; None for non-q-gram.
        Handed to QGram when the inputs are not both lists.
        The default qgraml if not given is None.
    :returns: Dice similarity, a value between 0 and 1
    :rtype: float

    Examples (values as recorded in the original documentation):

    >>> dice_sim('I am a good boy','am I a good boy',qgraml=None)
    1
    >>> dice_sim('Test String1','Test String2',qgraml=3)
    0.7857142857142857
    """
    if src == dest:
        return float(1)
    if not (src and dest):
        return float(0)

    src_tokens, dest_tokens = src, dest
    if not (isinstance(src, list) and isinstance(dest, list)):
        # At least one side is not a ready-made token list: tokenize both.
        src_tokens = QGram(src, qgraml)
        dest_tokens = QGram(dest, qgraml)

    src_unique = set(src_tokens)
    dest_unique = set(dest_tokens)
    size_sum = len(src_unique) + len(dest_unique)

    # Inclusion-exclusion: |X & Y| = |X| + |Y| - |X | Y|
    overlap = size_sum - len(src_unique | dest_unique)

    return 2.0 * overlap / size_sum
def sim_cosine(src, dest, qgraml=None):
    """
    cosine similarity

    For two token collections X and Y (duplicates ignored), the cosine
    similarity (Ochiai coefficient) is:
        |X & Y| / sqrt(|X| * |Y|)

    :param str src, dest: two strings to be compared (or two lists of
        tokens/q-grams, which are used as given).
        Equal inputs score 1; this check runs first, so it also covers two
        empty or two None inputs. Otherwise an empty or None input scores 0.
    :param int qgraml: the length of each q-gram; None for non-q-gram.
        Handed to QGram when the inputs are not both lists.
        The default qgraml if not given is None.
    :returns: cosine similarity
    :rtype: float

    Examples (values as recorded in the original documentation):

    >>> sim_cosine('Test String1','Test String2',qgraml=3)
    0.7857142857142857
    >>> sim_cosine('University of Wisconsin Madison','UW Madison')
    0.35355339059327373
    """
    if src == dest:
        return float(1)
    if not (src and dest):
        return float(0)

    if isinstance(src, list) and isinstance(dest, list):
        src_tokens, dest_tokens = src, dest
    else:
        src_tokens = QGram(src, qgraml)
        dest_tokens = QGram(dest, qgraml)

    src_unique = set(src_tokens)
    dest_unique = set(dest_tokens)
    shared = len(src_unique.intersection(dest_unique))

    # Geometric mean of the two distinct-token counts.
    norm = math.sqrt(len(src_unique) * len(dest_unique))
    return shared / norm