예제 #1
0
def chapman_orderedname_compound_sim(src,
                                     dest,
                                     func1=soundex_sim,
                                     func2=smith_waterman_sim,
                                     qgraml=None):
    """
    Chapman ordered-name compound similarity.

    Tokens of ``src`` and ``dest`` are paired starting from the END of each
    sequence and moving towards the front; only ``min(len(src), len(dest))``
    pairs are compared. Each pair is scored as the mean of ``func1`` and
    ``func2``, and the pair scores are combined with position-dependent
    weights that sum to 1 and give the trailing tokens the largest weight.

    :param src, dest: two strings to be compared, or two lists of tokens.
                      If both are lists they are used as-is; otherwise both
                      are tokenised with ``QGram(value, qgraml)``.
                      If either is empty/None a 0 similarity is returned.
    :param function func1: first token-level similarity, called as
                           ``func1(src_token, dest_token)``.
    :param function func2: second token-level similarity, called the same way.
    :param int qgraml: q-gram length handed to ``QGram``; default is None.
    :returns: weighted sum of the per-pair scores
    :rtype: float
    """
    # soundex and smith waterman are used in the java implementation;
    # support has been extended here so the functions can be swapped.
    if src == dest:
        return float(1)
    if not src or not dest:  # one side is empty/None -> no similarity
        return float(0)
    if isinstance(src, list) and isinstance(dest, list):
        q_src = src
        q_dest = dest
    else:
        q_src = QGram(src, qgraml)
        q_dest = QGram(dest, qgraml)
    q_src_mag = len(q_src)
    q_dest_mag = len(q_dest)
    min_tokens = min(q_src_mag, q_dest_mag)
    skew_amount = float(1)
    sum_matches = float(0)
    # ``range`` instead of ``xrange`` so the loop runs on Python 2 and 3.
    for i in range(1, min_tokens + 1):
        # Base weight 1/n plus a linear skew that is positive for small i
        # (tokens nearest the end) and negative for large i.
        weight = ((float(1) / min_tokens) + (
            ((((min_tokens - i) + float(0.5)) -
              (min_tokens / float(2))) / min_tokens) * skew_amount *
            (float(1) / min_tokens)))
        # i-th token counted from the end of each sequence
        s_token = q_src[q_src_mag - i]
        t_token = q_dest[q_dest_mag - i]
        found1 = func1(s_token, t_token)
        found2 = func2(s_token, t_token)
        sum_matches += (float(0.5) * (found1 + found2)) * weight
    return sum_matches
예제 #2
0
def matching_coefficient(src, dest, qgraml=None):
    """
    Matching coefficient similarity.

    For two token collections X and Y (duplicates are ignored), the value
    computed here is::

        |X & Y| / max(|X|, |Y|)

    :param src, dest: two strings to be compared, or two lists of tokens.
                      If both are lists they are used as-is; otherwise both
                      are tokenised with ``QGram(value, qgraml)``.
                      If src or dest is empty/None, a 0 similarity is returned.
    :param int qgraml: q-gram length handed to ``QGram``; default is None.
    :returns: Matching coefficient similarity
    :rtype: float

    >>> matching_coefficient(['a', 'b'], ['a', 'c'])
    0.5
    """
    if src == dest:
        return float(1)
    if not src or not dest:  # one side is empty/None -> no similarity
        return float(0)
    if isinstance(src, list) and isinstance(dest, list):
        q_src = src
        q_dest = dest
    else:
        q_src = QGram(src, qgraml)
        q_dest = QGram(dest, qgraml)
    q_src_set = set(q_src)
    q_dest_set = set(q_dest)
    max_tokens = max(len(q_src_set), len(q_dest_set))
    if max_tokens == 0:
        # Tokenisation produced nothing on either side; avoid dividing by 0.
        return float(0)
    common_terms = len(q_src_set & q_dest_set)
    # Force float division so the result is not truncated to 0 on Python 2.
    return float(common_terms) / max_tokens
예제 #3
0
def qgram_distance(src, dest, qgraml=3):
    """
    QGram Distance similarity.

    Both inputs are reduced to SETS of tokens, so every token counts at most
    once per side. With X and Y those sets, the value computed here is::

        (|X| + |Y| - |X ^ Y|) / (|X| + |Y|)

    where ``X ^ Y`` is the set of tokens present on exactly one side.

    :param src, dest: two strings to be compared, or two lists of tokens.
                      If both are lists they are used as-is; otherwise both
                      are tokenised with ``QGram(value, qgraml)``.
                      If src or dest is empty/None, a 0 similarity is returned.
    :param int qgraml: q-gram length handed to ``QGram``; None for non-q-gram.
                       The default qgraml if not given is 3.
    :returns: QGram Distance similarity
    :rtype: float

    >>> qgram_distance('Test String1', 'Test String2', qgraml=None)
    0.5
    >>> qgram_distance('Test String1', 'Test String2')
    0.7857142857142857
    """
    if src == dest:
        return float(1.0)
    if not src or not dest:  # one side is empty/None -> no similarity
        return float(0.0)
    if isinstance(src, list) and isinstance(dest, list):
        q_src = src
        q_dest = dest
    else:
        q_src = QGram(src, qgraml)
        q_dest = QGram(dest, qgraml)
    q_src_set = set(q_src)
    q_dest_set = set(q_dest)

    total_qgrams = len(q_src_set) + len(q_dest_set)
    if total_qgrams == 0:
        # Tokenisation produced nothing on either side; avoid dividing by 0.
        return float(0.0)

    # Each token occurs 0 or 1 times in a set, so the summed absolute count
    # difference is exactly the number of tokens found on one side only.
    difference = len(q_src_set ^ q_dest_set)

    # Force float division so the result is not truncated on Python 2.
    return float(total_qgrams - difference) / total_qgrams
예제 #4
0
def eucledian_dist_sim(src, dest, qgraml=None):
    """
    Eucledian similarity.

    Both inputs are reduced to SETS of tokens, so every token counts at most
    once per side. With X and Y those sets, the value computed here is::

        (P - D) / P    where  P = sqrt(|X|**2 + |Y|**2)
                              D = sqrt(|X ^ Y|)

    and ``X ^ Y`` is the set of tokens present on exactly one side.

    :param src, dest: two strings to be compared, or two lists of tokens.
                      If both are lists they are used as-is; otherwise both
                      are tokenised with ``QGram(value, qgraml)``.
                      If src or dest is empty/None, a 0 similarity is returned.
    :param int qgraml: q-gram length handed to ``QGram``; None for non-q-gram.
                       The default qgraml if not given is None.
    :returns: Eucledian similarity
    :rtype: float

    >>> eucledian_dist_sim('I am a good boy', 'am I a good boy', qgraml=None)
    1.0
    >>> eucledian_dist_sim('Test String1', 'Test String2', qgraml=3)
    0.8762820851736516
    """
    if src == dest:
        return float(1)
    if not src or not dest:  # one side is empty/None -> no similarity
        return float(0)
    if isinstance(src, list) and isinstance(dest, list):
        q_src = src
        q_dest = dest
    else:
        q_src = QGram(src, qgraml)
        q_dest = QGram(dest, qgraml)
    q_src_set = set(q_src)
    q_dest_set = set(q_dest)
    q_src_mag = len(q_src_set)
    q_dest_mag = len(q_dest_set)
    total_possible = math.sqrt(q_src_mag * q_src_mag +
                               q_dest_mag * q_dest_mag)
    if total_possible == 0:
        # Tokenisation produced nothing on either side; avoid dividing by 0.
        return float(0)
    # Per-token counts are 0 or 1 on each side, so the squared count
    # difference is 1 for tokens on exactly one side and 0 otherwise.
    total_distance = math.sqrt(len(q_src_set ^ q_dest_set))
    return (total_possible - total_distance) / total_possible
예제 #5
0
def monge_elkan_sim(src, dest, qgraml=None, func=sim_cosine, symmetric=False):
    """
    Monge Elkan Similarity.

    For every token of ``src`` the best score against any token of ``dest``
    is taken (never below 0), and those best scores are averaged over the
    number of ``src`` tokens.

    :param src: first string (or list of tokens) to be compared.
    :param dest: second string (or list of tokens) to compare with.
                 If both are lists they are used as-is; otherwise both are
                 tokenised with ``QGram(value, qgraml)``.
    :param int qgraml: q-gram length handed to ``QGram`` and passed as the
                       third argument of ``func``; default is None.
    :param function func: token-level similarity, called as
                          ``func(src_token, dest_token, qgraml)``. It should
                          return a normalized similarity.
                          Defaults to ``sim_cosine``.
    :param boolean symmetric: if True, the measure is also evaluated with
                              ``src`` and ``dest`` interchanged and the mean
                              of the two directions is returned.
    :returns: Monge Elkan similarity built on ``func``
    :rtype: float

    >>> monge_elkan_sim('Test String1', 'Test String2', qgraml=None, func=sim_cosine, symmetric=False)
    0.5
    """
    if src == dest:
        return float(1)
    if not dest or not src:  # one side is empty/None -> no similarity
        return float(0)
    if isinstance(src, list) and isinstance(dest, list):
        q_src = src
        q_dest = dest
    else:
        q_src = QGram(src, qgraml)
        q_dest = QGram(dest, qgraml)
    if len(q_src) == 0:
        # Tokenisation produced nothing to average over; avoid dividing by 0.
        return float(0)
    sum_matches = float(0)
    for stoken in q_src:
        best = float(0)
        for dtoken in q_dest:
            found = func(stoken, dtoken, qgraml)
            if found > best:
                best = found
        sum_matches = sum_matches + best
    sim_res = sum_matches / len(q_src)
    if symmetric:
        # The reverse direction must swap the arguments; calling with
        # (src, dest) again would just average the same value with itself.
        sim_res = (sim_res + monge_elkan_sim(dest, src, qgraml, func)) / 2
    return sim_res
예제 #6
0
def dice_sim(src, dest, qgraml=None):
    """
    Dice similarity.

    For two token collections X and Y (duplicates are ignored), the value
    computed here is::

        2 * |X & Y| / (|X| + |Y|)

    :param src, dest: two strings to be compared, or two lists of tokens.
                      If both are lists they are used as-is; otherwise both
                      are tokenised with ``QGram(value, qgraml)``.
                      If src or dest is empty/None, a 0 similarity is returned.
    :param int qgraml: q-gram length handed to ``QGram``; None for non-q-gram.
                       The default qgraml if not given is None.
    :returns: Dice similarity, a value between 0 and 1
    :rtype: float

    >>> dice_sim('I am a good boy', 'am I a good boy', qgraml=None)
    1.0
    >>> dice_sim('Test String1', 'Test String2', qgraml=3)
    0.7857142857142857
    """
    if src == dest:
        return float(1)
    if not src or not dest:  # one side is empty/None -> no similarity
        return float(0)
    if isinstance(src, list) and isinstance(dest, list):
        q_src = src
        q_dest = dest
    else:
        # Tokenise BOTH sides only when the inputs are not already lists;
        # pre-tokenised list input must not be passed through QGram again.
        q_src = QGram(src, qgraml)
        q_dest = QGram(dest, qgraml)
    q_src_set = set(q_src)
    q_dest_set = set(q_dest)
    total_mag = len(q_src_set) + len(q_dest_set)
    if total_mag == 0:
        # Tokenisation produced nothing on either side; avoid dividing by 0.
        return float(0)
    common_terms = len(q_src_set & q_dest_set)
    # the Dice coefficient
    return 2.0 * common_terms / total_mag
예제 #7
0
def sim_cosine(src, dest, qgraml=None):
    """
    Cosine similarity (Ochiai coefficient).

    For two token collections X and Y (duplicates are ignored), the value
    computed here is::

        |X & Y| / sqrt(|X| * |Y|)

    :param src, dest: two strings to be compared, or two lists of tokens.
                      If both are lists they are used as-is; otherwise both
                      are tokenised with ``QGram(value, qgraml)``.
                      If src or dest is empty/None, a 0 similarity is returned.
    :param int qgraml: q-gram length handed to ``QGram``; None for non-q-gram.
                       The default qgraml if not given is None.
    :returns: cosine similarity
    :rtype: float

    >>> sim_cosine('Test String1', 'Test String2', qgraml=3)
    0.7857142857142857
    >>> sim_cosine('University of Wisconsin Madison', 'UW Madison')
    0.35355339059327373
    """
    if src == dest:
        return float(1)
    if not src or not dest:  # one side is empty/None -> no similarity
        return float(0)

    if isinstance(src, list) and isinstance(dest, list):
        q_src = src
        q_dest = dest
    else:
        q_src = QGram(src, qgraml)
        q_dest = QGram(dest, qgraml)
    # Build the sets for BOTH branches; list input needs them as well.
    q_src_set = set(q_src)
    q_dest_set = set(q_dest)
    q_src_mag = len(q_src_set)
    q_dest_mag = len(q_dest_set)
    if q_src_mag == 0 or q_dest_mag == 0:
        # Tokenisation produced nothing on one side; avoid dividing by 0.
        return float(0)
    common_terms = len(q_src_set & q_dest_set)
    return common_terms / math.sqrt(q_src_mag * q_dest_mag)