Esempio n. 1
0
    def get_raw_score(self, string1, string2):
        """Return the raw Levenshtein distance between two strings.

        Both inputs are passed through ``utils.convert_to_unicode`` before
        the string type check; the distance itself is computed by
        ``levenshtein``.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Levenshtein distance (int). When the exact-match check succeeds
            the method short-circuits and returns ``0.0``.

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_raw_score('a', '')
            1
            >>> lev.get_raw_score('example', 'samples')
            3
            >>> lev.get_raw_score('levenshtein', 'frankenstein')
            6
        """

        # Reject None before attempting any conversion.
        utils.sim_check_for_none(string1, string2)

        # Normalise both operands, then make sure they really are strings.
        string1, string2 = (utils.convert_to_unicode(string1),
                            utils.convert_to_unicode(string2))
        utils.tok_check_for_string_input(string1, string2)

        # Matching inputs need no edit-distance computation.
        identical = utils.sim_check_for_exact_match(string1, string2)
        return 0.0 if identical else levenshtein(string1, string2)
Esempio n. 2
0
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
    is carried out using a sequence of the following operators: delete a character, insert a character, and
    substitute one character for another. Every operator has unit cost here.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int). When the exact-match check succeeds the
        function short-circuits and returns ``0.0``; when both strings are
        non-empty the value is read from a NumPy integer matrix.

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 0.0

    ins_cost, del_cost, sub_cost = (1, 1, 1)

    len_str1 = len(string1)
    len_str2 = len(string2)

    # Against an empty string the distance is pure insertion or deletion.
    if len_str1 == 0:
        return len_str2 * ins_cost

    if len_str2 == 0:
        return len_str1 * del_cost

    # d_mat[i, j] holds the distance between string1[:i] and string2[:j].
    # The builtin ``int`` is used as dtype: the ``np.int`` alias was
    # deprecated in NumPy 1.20 and later removed, so it raises
    # AttributeError on current releases.
    d_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=int)

    # First column: delete every character of the string1 prefix.
    for i in _range(len_str1 + 1):
        d_mat[i, 0] = i * del_cost

    # First row: insert every character of the string2 prefix.
    for j in _range(len_str2 + 1):
        d_mat[0, j] = j * ins_cost

    for i in _range(len_str1):
        for j in _range(len_str2):
            # Cheapest of insertion, deletion and (possibly free) substitution.
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost, d_mat[i, j + 1] + del_cost,
                d_mat[i, j] + (sub_cost if string1[i] != string2[j] else 0))

    return d_mat[len_str1, len_str2]
    def get_raw_score(self, string1, string2):
        """Compute the raw Levenshtein distance for a pair of strings.

        The inputs are run through ``utils.convert_to_unicode`` and then
        type-checked; the edit distance is obtained from ``levenshtein``.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Levenshtein distance (int). If the exact-match check succeeds,
            ``0.0`` is returned without calling ``levenshtein``.

        Raises:
            TypeError : If the inputs are not strings.

        Examples:
            >>> lev = Levenshtein()
            >>> lev.get_raw_score('a', '')
            1
            >>> lev.get_raw_score('example', 'samples')
            3
            >>> lev.get_raw_score('levenshtein', 'frankenstein')
            6
        """

        # None is rejected up front, before conversion.
        utils.sim_check_for_none(string1, string2)

        # Convert first, validate the converted values afterwards.
        string1, string2 = (utils.convert_to_unicode(string1),
                            utils.convert_to_unicode(string2))
        utils.tok_check_for_string_input(string1, string2)

        # Equal strings short-circuit with a zero distance.
        is_same = utils.sim_check_for_exact_match(string1, string2)
        return 0.0 if is_same else levenshtein(string1, string2)
Esempio n. 4
0
def monge_elkan(bag1, bag2, sim_func=jaro_winkler):
    """
    Compute the Monge-Elkan similarity measure between two bags (lists).

    Monge-Elkan is a hybrid measure that combines sequence-based and
    set-based methods: every element of ``bag1`` is scored against every
    element of ``bag2`` with a secondary similarity function, the best score
    per ``bag1`` element is kept, and those best scores are averaged over
    ``len(bag1)``.

    Args:
        bag1,bag2 (list): Input lists

        sim_func (function): Secondary similarity function, called as
            ``sim_func(element1, element2)``. This is expected to be a
            sequence-based similarity measure (defaults to jaro_winkler)

    Returns:
        Monge-Elkan similarity score (float). ``1.0`` is returned when the
        exact-match check succeeds and ``0`` when the empty-input check does.

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None


    Examples:
        >>> monge_elkan(['Niall'], ['Neal'])
        0.8049999999999999
        >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
        0.8677218614718616
        >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=needleman_wunsch)
        2.0
        >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=affine)
        2.25
        >>> monge_elkan([''], ['a'])
        0.0
        >>> monge_elkan(['Niall'], ['Nigel'])
        0.7866666666666667

    References:
        * Principles of Data Integration book
    """
    # Validate the arguments before doing any work.
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)

    # Trivial outcomes: identical bags, or an empty bag.
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0

    # Accumulate, for each element of bag1, its best score against bag2.
    total = 0
    for token1 in bag1:
        best = float('-inf')
        for token2 in bag2:
            score = sim_func(token1, token2)
            if score > best:
                best = score
        total += best

    # Average over the number of elements in the first bag.
    return float(total) / float(len(bag1))
Esempio n. 5
0
def monge_elkan(bag1, bag2, sim_func=jaro_winkler):
    """
    Compute the Monge-Elkan similarity measure between two bags (lists).

    Monge-Elkan is a hybrid similarity measure combining sequence-based and
    set-based methods. Each element of ``bag1`` is compared with every
    element of ``bag2`` through a secondary similarity function; the highest
    score found for each ``bag1`` element is summed and the sum is divided by
    ``len(bag1)``.

    Args:
        bag1,bag2 (list): Input lists

        sim_func (function): Secondary similarity function, invoked as
            ``sim_func(element1, element2)``. This is expected to be a
            sequence-based similarity measure (defaults to jaro_winkler)

    Returns:
        Monge-Elkan similarity score (float). The exact-match check yields
        ``1.0`` and the empty-input check yields ``0``.

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None


    Examples:
        >>> monge_elkan(['Niall'], ['Neal'])
        0.8049999999999999
        >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
        0.8677218614718616
        >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=needleman_wunsch)
        2.0
        >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=affine)
        2.25
        >>> monge_elkan([''], ['a'])
        0.0
        >>> monge_elkan(['Niall'], ['Nigel'])
        0.7866666666666667

    References:
        * Principles of Data Integration book
    """
    # Argument checks come first.
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)

    # Short-circuit on identical bags and on an empty bag.
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0

    # For every element of bag1 keep only its strongest match in bag2.
    running_sum = 0
    for left in bag1:
        top_score = float('-inf')
        for right in bag2:
            candidate = sim_func(left, right)
            if candidate > top_score:
                top_score = candidate
        running_sum += top_score

    # Normalise by the size of the first bag.
    return float(running_sum) / float(len(bag1))
    def get_raw_score(self, bag1, bag2):
        """Return the raw Monge-Elkan score between two bags (lists).

        Each element of ``bag1`` is scored against every element of ``bag2``
        with ``self.sim_func``; the best score per ``bag1`` element is summed
        and the sum is divided by ``len(bag1)``.

        Args:
            bag1,bag2 (list): Input lists.

        Returns:
            Monge-Elkan similarity score (float). ``1.0`` is returned when the
            exact-match check succeeds and ``0`` when the empty-input check
            does.

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None.

        Examples:
            >>> me = MongeElkan()
            >>> me.get_raw_score(['Niall'], ['Neal'])
            0.8049999999999999
            >>> me.get_raw_score(['Niall'], ['Nigel'])
            0.7866666666666667
            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            0.8677218614718616
            >>> me.get_raw_score([''], ['a'])
            0.0
            >>> me = MongeElkan(sim_func=NeedlemanWunsch().get_raw_score)
            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            2.0
            >>> me = MongeElkan(sim_func=Affine().get_raw_score)
            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            2.25

        References:
            * Principles of Data Integration book
        """

        # Validate the arguments before doing any work.
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # Trivial outcomes: identical bags, or an empty bag.
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0
        if utils.sim_check_for_empty(bag1, bag2):
            return 0

        # Accumulate, for each element of bag1, its best score against bag2.
        total = 0
        for token1 in bag1:
            best = float('-inf')
            for token2 in bag2:
                score = self.sim_func(token1, token2)
                if score > best:
                    best = score
            total += best

        # Average over the number of elements in the first bag.
        return float(total) / float(len(bag1))
Esempio n. 7
0
    def get_raw_score(self, bag1, bag2):
        """Compute the raw Monge-Elkan score for a pair of bags (lists).

        Every element of ``bag1`` is compared with each element of ``bag2``
        through ``self.sim_func``. The highest score found for each ``bag1``
        element is added up and the total is divided by ``len(bag1)``.

        Args:
            bag1,bag2 (list): Input lists.

        Returns:
            Monge-Elkan similarity score (float). The exact-match check yields
            ``1.0`` and the empty-input check yields ``0``.

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None.

        Examples:
            >>> me = MongeElkan()
            >>> me.get_raw_score(['Niall'], ['Neal'])
            0.8049999999999999
            >>> me.get_raw_score(['Niall'], ['Nigel'])
            0.7866666666666667
            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            0.8677218614718616
            >>> me.get_raw_score([''], ['a'])
            0.0
            >>> me = MongeElkan(sim_func=NeedlemanWunsch().get_raw_score)
            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            2.0
            >>> me = MongeElkan(sim_func=Affine().get_raw_score)
            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            2.25

        References:
            * Principles of Data Integration book
        """

        # Argument checks come first.
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # Short-circuit on identical bags and on an empty bag.
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0
        if utils.sim_check_for_empty(bag1, bag2):
            return 0

        # For every element of bag1 keep only its strongest match in bag2.
        running_sum = 0
        for left in bag1:
            top_score = float('-inf')
            for right in bag2:
                candidate = self.sim_func(left, right)
                if candidate > top_score:
                    top_score = candidate
            running_sum += top_score

        # Normalise by the size of the first bag.
        return float(running_sum) / float(len(bag1))
Esempio n. 8
0
    def get_raw_score(self, set1, set2):
        """
        Computes the Tversky index similarity between two sets.

        The Tversky index is an asymmetric similarity measure on sets that compares a variant to a prototype. The
        Tversky index can be seen as a generalization of Dice's coefficient and Tanimoto coefficient.

        For sets X and Y the Tversky index is a number between 0 and 1 given by:
        :math:`tversky\\_index(X, Y) = \\frac{|X \\cap Y|}{|X \\cap Y| + \\alpha |X-Y| + \\beta |Y-X|}`
        where :math:`\\alpha, \\beta \\geq 0`. The weights are read from ``self.alpha`` and ``self.beta``.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Tversky index similarity (float). ``1.0`` is returned when the
            exact-match check succeeds and ``0`` when the empty-input check
            does.

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> tvi = TverskyIndex()
            >>> tvi.get_raw_score(['data', 'science'], ['data'])
            0.6666666666666666
            >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.5
            >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> tvi = TverskyIndex(0.5, 0.5)
            >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> tvi = TverskyIndex(beta=0.5)
            >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.5
        """
        # input validations
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # if exact match return 1.0
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # if one of the inputs is empty return 0
        if utils.sim_check_for_empty(set1, set2):
            return 0

        # Work on real sets so that duplicates in list inputs are ignored.
        if not isinstance(set1, set):
            set1 = set(set1)
        if not isinstance(set2, set):
            set2 = set(set2)
        intersection = float(len(set1 & set2))

        # NOTE(review): the denominator is zero when the sets are disjoint
        # and both weights are 0 -- confirm the constructor rules that out.
        return intersection / (intersection +
                               (self.alpha * len(set1 - set2)) +
                               (self.beta * len(set2 - set1)))
    def get_raw_score(self, set1, set2):
        """
        Computes the Tversky index similarity between two sets.

        The Tversky index is an asymmetric similarity measure on sets that compares a variant to a prototype. The
        Tversky index can be seen as a generalization of Dice's coefficient and Tanimoto coefficient.

        For sets X and Y the Tversky index is a number between 0 and 1 given by:
        :math:`tversky\\_index(X, Y) = \\frac{|X \\cap Y|}{|X \\cap Y| + \\alpha |X-Y| + \\beta |Y-X|}`
        where :math:`\\alpha, \\beta \\geq 0`. The weights are read from ``self.alpha`` and ``self.beta``.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Tversky index similarity (float). ``1.0`` is returned when the
            exact-match check succeeds and ``0`` when the empty-input check
            does.

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> tvi = TverskyIndex()
            >>> tvi.get_raw_score(['data', 'science'], ['data'])
            0.6666666666666666
            >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.5
            >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> tvi = TverskyIndex(0.5, 0.5)
            >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> tvi = TverskyIndex(beta=0.5)
            >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.5
        """
        # input validations
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # if exact match return 1.0
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # if one of the inputs is empty return 0
        if utils.sim_check_for_empty(set1, set2):
            return 0

        # Work on real sets so that duplicates in list inputs are ignored.
        if not isinstance(set1, set):
            set1 = set(set1)
        if not isinstance(set2, set):
            set2 = set(set2)
        intersection = float(len(set1 & set2))

        # NOTE(review): the denominator is zero when the sets are disjoint
        # and both weights are 0 -- confirm the constructor rules that out.
        return intersection / (intersection +
                               (self.alpha * len(set1 - set2)) +
                               (self.beta * len(set2 - set1)))
    def get_raw_score(self, string1, string2):
        """
        Return the bag distance between two strings.

        Each string is treated as a multiset (bag) of its characters. For two
        strings X and Y, the Bag distance is:
        :math:`max( |bag(string1)-bag(string2)|, |bag(string2)-bag(string1)| )`

        Args:
            string1,string2 (str): Input strings

        Returns:
            Bag distance (int)

        Raises:
            TypeError : If the inputs are not strings

        Examples:
            >>> bd = BagDistance()
            >>> bd.get_raw_score('cat', 'hat')
            1
            >>> bd.get_raw_score('Niall', 'Neil')
            2
            >>> bd.get_raw_score('aluminum', 'Catalan')
            5
            >>> bd.get_raw_score('ATCG', 'TAGC')
            0
            >>> bd.get_raw_score('abcde', 'xyz')
            5

        References:
            * String Matching with Metric Trees Using an Approximate Distance: http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf
        """
        # Validate the arguments, then short-circuit on identical strings.
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)
        if utils.sim_check_for_exact_match(string1, string2):
            return 0

        # Against an empty string the distance is the other string's length.
        size_a, size_b = len(string1), len(string2)
        if size_a == 0 or size_b == 0:
            return max(size_a, size_b)

        chars_a = collections.Counter(string1)
        chars_b = collections.Counter(string2)

        # Counter subtraction keeps only positive counts, i.e. the surplus
        # characters on each side.
        surplus_a = sum((chars_a - chars_b).values())
        surplus_b = sum((chars_b - chars_a).values())

        # The distance is the larger of the two surpluses.
        return max(surplus_a, surplus_b)
Esempio n. 11
0
    def get_raw_score(self, string1, string2):
        """
        Return the bag distance between two strings.

        Each string is viewed as a multiset (bag) of its characters. For two
        strings X and Y, the Bag distance is:
        :math:`max( |bag(string1)-bag(string2)|, |bag(string2)-bag(string1)| )`

        Args:
            string1,string2 (str): Input strings

        Returns:
            Bag distance (int)

        Raises:
            TypeError : If the inputs are not strings

        Examples:
            >>> bd = BagDistance()
            >>> bd.get_raw_score('cat', 'hat')
            1
            >>> bd.get_raw_score('Niall', 'Neil')
            2
            >>> bd.get_raw_score('aluminum', 'Catalan')
            5
            >>> bd.get_raw_score('ATCG', 'TAGC')
            0
            >>> bd.get_raw_score('abcde', 'xyz')
            5

        References:
            * http://www.icmlc.org/icmlc2011/018_icmlc2011.pdf
        """
        # Argument checks first; identical strings are at distance zero.
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)
        if utils.sim_check_for_exact_match(string1, string2):
            return 0

        # If either string is empty, the other one's length is the answer.
        length1, length2 = len(string1), len(string2)
        if length1 == 0 or length2 == 0:
            return max(length1, length2)

        counts1 = collections.Counter(string1)
        counts2 = collections.Counter(string2)

        # Subtracting Counters drops non-positive counts, leaving the
        # characters one string has in excess of the other.
        only_in_1 = sum((counts1 - counts2).values())
        only_in_2 = sum((counts2 - counts1).values())

        # Report the bigger of the two one-sided differences.
        return max(only_in_1, only_in_2)
Esempio n. 12
0
    def get_word_vector_similarities_simple(self, bag1, bag2):
        """Build a similarity vector for ``bag1`` over the document-frequency terms.

        For every distinct term of ``bag1`` the best ``self.sim_func`` score
        against the distinct terms of ``bag2`` is recorded, provided it is
        strictly greater than ``self.threshold`` (and greater than 0). The
        result has one slot per key of the document-frequency mapping, in
        that mapping's key order: the slot holds the recorded score when the
        key is a scored ``bag1`` term and 0 otherwise.

        Args:
            bag1,bag2 (list or set): Input bags of terms.

        Returns:
            A NumPy float array of length ``len(document_frequency)``.
            Note that the two early exits return scalars instead: ``1.0``
            when the exact-match check succeeds and ``0`` when the
            empty-input check does.
        """
        # input validations
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # if the bags match exactly return 1.0
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0

        # if one of the bags is empty return 0
        if utils.sim_check_for_empty(bag1, bag2):
            return 0

        # Counters are used only to iterate over the distinct terms.
        tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)

        # NOTE(review): there is no fallback when no corpus statistics were
        # supplied; presumably the mapping is always set -- confirm.
        curr_df = self.__document_frequency

        # Best above-threshold score of each bag1 term against bag2.
        best_scores = {}
        for term_x in tf_x:
            max_score = 0.0
            for term_y in tf_y:
                score = self.sim_func(term_x, term_y)
                # keep the score only if it is above threshold and the
                # highest seen so far for this term
                if score > self.threshold and score > max_score:
                    best_scores[term_x] = score
                    max_score = score

        # One slot per known term; slots without a recorded score stay 0.
        word_similarities_vector = np.zeros(len(curr_df))
        for idx, element in enumerate(curr_df.keys()):
            if element in best_scores:
                word_similarities_vector[idx] = best_scores[element]

        return word_similarities_vector
Esempio n. 13
0
def overlap_coefficient(set1, set2):
    """
    Computes the overlap coefficient between two sets.

    The overlap coefficient is a similarity measure related to the Jaccard
    measure that measures the overlap between two sets, and is defined as the size of the intersection divided by
    the smaller of the size of the two sets.

    For two sets X and Y, the overlap coefficient is:

    :math:`overlap\\_coefficient(X, Y) = \\frac{|X \\cap Y|}{\\min(|X|, |Y|)}`

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

    Returns:
        Overlap coefficient (float). ``1.0`` is returned when the exact-match
        check succeeds and ``0`` when the empty-input check does.

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

    Examples:
        >>> overlap_coefficient([], [])
        1.0
        >>> overlap_coefficient([], ['data'])
        0
        >>> overlap_coefficient(['data', 'science'], ['data'])
        1.0

    References:
        * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient
        * Simmetrics library

    """
    # input validations
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)
    # The exact-match check runs before the empty check, which is why two
    # empty inputs score 1.0 rather than 0.
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0
    # if one of the inputs is empty return 0
    if utils.sim_check_for_empty(set1, set2):
        return 0
    # Work on real sets so that duplicates in list inputs are ignored.
    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)

    return float(len(set1 & set2)) / min(len(set1), len(set2))
Esempio n. 14
0
def overlap_coefficient(set1, set2):
    """
    Computes the overlap coefficient between two sets.

    The overlap coefficient is a similarity measure related to the Jaccard
    measure that measures the overlap between two sets, and is defined as the size of the intersection divided by
    the smaller of the size of the two sets.

    For two sets X and Y, the overlap coefficient is:

    :math:`overlap\\_coefficient(X, Y) = \\frac{|X \\cap Y|}{\\min(|X|, |Y|)}`

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

    Returns:
        Overlap coefficient (float). ``1.0`` is returned when the exact-match
        check succeeds and ``0`` when the empty-input check does.

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

    Examples:
        >>> overlap_coefficient([], [])
        1.0
        >>> overlap_coefficient([], ['data'])
        0
        >>> overlap_coefficient(['data', 'science'], ['data'])
        1.0

    References:
        * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient
        * Simmetrics library

    """
    # input validations
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)
    # The exact-match check runs before the empty check, which is why two
    # empty inputs score 1.0 rather than 0.
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0
    # if one of the inputs is empty return 0
    if utils.sim_check_for_empty(set1, set2):
        return 0
    # Work on real sets so that duplicates in list inputs are ignored.
    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)

    return float(len(set1 & set2)) / min(len(set1), len(set2))
Esempio n. 15
0
def jaccard(set1, set2):
    """
    Return the Jaccard measure between two sets.

    The Jaccard measure, also known as the Jaccard similarity coefficient,
    compares the similarity and diversity of finite sample sets. It is the
    size of the intersection divided by the size of the union of the sets.

    For two sets X and Y, the Jaccard measure is:

    :math:`jaccard(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`


    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

    Returns:
        Jaccard similarity (float). ``1.0`` is returned when the exact-match
        check succeeds and ``0`` when the empty-input check does.

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

    Examples:
        >>> jaccard(['data', 'science'], ['data'])
        0.5
        >>> jaccard({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
        0.375
        >>> jaccard(['data', 'management'], ['data', 'data', 'science'])
        0.3333333333333333
    """
    # Validate the arguments before doing any work.
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # Trivial outcomes: identical inputs, or an empty input.
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # Lists are turned into sets so duplicates do not count twice.
    left = set1 if isinstance(set1, set) else set(set1)
    right = set2 if isinstance(set2, set) else set(set2)

    shared = len(left & right)
    combined = len(left | right)
    return float(shared) / float(combined)
Esempio n. 16
0
def cosine(set1, set2):
    """
    Computes the cosine similarity between two sets.

    For two sets X and Y, the cosine similarity is:

    :math:`cosine(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}`


    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

    Returns:
        Cosine similarity (float). ``1.0`` is returned when the exact-match
        check succeeds and ``0.0`` when the empty-input check does.

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

    Examples:
        >>> cosine(['data', 'science'], ['data'])
        0.7071067811865475
        >>> cosine(['data', 'data', 'science'], ['data', 'management'])
        0.4999999999999999
        >>> cosine([], ['data'])
        0.0

    References:
        * String similarity joins: An Experimental Evaluation (VLDB 2014)
        * Project flamingo : Mike carey, Vernica
    """
    # input validations
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)
    # if exact match return 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0
    # An empty input scores 0.0 -- a float, matching the documented return
    # type and the doctest above (this path used to return the int 0).
    if utils.sim_check_for_empty(set1, set2):
        return 0.0
    # Work on real sets so that duplicates in list inputs are ignored.
    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)
    return float(len(set1 & set2)) / (math.sqrt(float(len(set1))) *
                                      math.sqrt(float(len(set2))))
Esempio n. 17
0
def jaccard(set1, set2):
    """
    Return the Jaccard measure between two sets.

    The Jaccard measure (Jaccard similarity coefficient) is a statistic for
    comparing the similarity and diversity of finite sample sets, defined as
    the size of their intersection divided by the size of their union.

    For two sets X and Y, the Jaccard measure is:

    :math:`jaccard(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`


    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

    Returns:
        Jaccard similarity (float). The exact-match check yields ``1.0`` and
        the empty-input check yields ``0``.

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

    Examples:
        >>> jaccard(['data', 'science'], ['data'])
        0.5
        >>> jaccard({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
        0.375
        >>> jaccard(['data', 'management'], ['data', 'data', 'science'])
        0.3333333333333333
    """
    # Argument checks come first.
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # Short-circuit on identical inputs and on an empty input.
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # Coerce list inputs to sets; existing sets are used as they are.
    first = set1 if isinstance(set1, set) else set(set1)
    second = set2 if isinstance(set2, set) else set(set2)

    common_count = len(first & second)
    union_count = len(first | second)
    return float(common_count) / float(union_count)
    def get_raw_score(self, set1, set2):
        """Return the raw Dice score between two sets. This score is already in [0,1].

        The score is twice the size of the intersection divided by the sum
        of the sizes of the two sets.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Dice similarity score (float). ``1.0`` is returned when the
            exact-match check succeeds and ``0`` when the empty-input check
            does.

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> dice = Dice()
            >>> dice.get_raw_score(['data', 'science'], ['data'])
            0.6666666666666666
            >>> dice.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> dice.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.5

        References:
            * Wikipedia article : https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient
            * SimMetrics library.
        """

        # Validate the arguments before doing any work.
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # Trivial outcomes: identical inputs, or an empty input.
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0
        if utils.sim_check_for_empty(set1, set2):
            return 0

        # Lists are turned into sets so duplicates do not count twice.
        left = set1 if isinstance(set1, set) else set(set1)
        right = set2 if isinstance(set2, set) else set(set2)

        overlap = float(len(left & right))
        combined_size = float(len(left) + len(right))
        return 2.0 * overlap / combined_size
Esempio n. 19
0
    def get_raw_score(self, set1, set2):
        """Return the raw overlap coefficient score between two sets.

        The score is the size of the intersection divided by the size of the
        smaller of the two sets.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Overlap coefficient (float). ``1.0`` is returned when the
            exact-match check succeeds and ``0`` when the empty-input check
            does.

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.

        Examples:
            >>> oc = OverlapCoefficient()
            >>> oc.get_raw_score(['data', 'science'], ['data'])
            1.0
            >>> oc.get_raw_score([], [])
            1.0
            >>> oc.get_raw_score([], ['data'])
            0

        References:
            * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient
            * SimMetrics library
        """

        # Validate the arguments before doing any work.
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # The exact-match check precedes the empty check, so two empty
        # inputs score 1.0.
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0
        if utils.sim_check_for_empty(set1, set2):
            return 0

        # Lists are turned into sets so duplicates do not count twice.
        left = set1 if isinstance(set1, set) else set(set1)
        right = set2 if isinstance(set2, set) else set(set2)

        shared = float(len(left & right))
        smaller_size = min(len(left), len(right))
        return shared / smaller_size
Esempio n. 20
0
def cosine(set1, set2):
    """
    Computes the cosine similarity between two sets.

    For two sets X and Y, the cosine similarity is:

    :math:`cosine(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}`


    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

    Returns:
        Cosine similarity (float). 1.0 when the inputs compare equal, 0.0 when the
        emptiness check succeeds.

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the inputs is None
            (raised by the ``utils`` validation helpers).

    Examples:
     >>> cosine(['data', 'science'], ['data'])
     0.7071067811865475
     >>> cosine(['data', 'data', 'science'], ['data', 'management'])
     0.4999999999999999
     >>> cosine([], ['data'])
     0.0

    References:
        * String similarity joins: An Experimental Evaluation (VLDB 2014)
        * Project flamingo : Mike carey, Vernica
    """
    # input validations
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)
    # if exact match return 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0
    # if one of the inputs is empty return 0.0 (a float, as the docstring
    # example documents; the previous integer 0 did not match it)
    if utils.sim_check_for_empty(set1, set2):
        return 0.0
    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)
    # the two square roots are multiplied separately (rather than taking the
    # root of the product) so results stay bit-identical to the documented ones
    return float(len(set1 & set2)) / (math.sqrt(float(len(set1))) * math.sqrt(float(len(set2))))
Esempio n. 21
0
    def get_raw_score(self, set1, set2):
        """Return the raw Dice score of two token collections (a value in [0,1]).

        The score is twice the number of shared elements divided by the sum of
        the sizes of the two (de-duplicated) inputs.

        Args:
            set1,set2 (set or list): Input sets (or lists). Anything that is not already a ``set`` is converted to one.

        Returns:
            Dice similarity score (float). 1.0 is returned when the inputs compare equal and the
            integer 0 when the emptiness check succeeds.

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None
                (raised by the ``utils`` validation helpers).

        Examples:
            >>> dice = Dice()
            >>> dice.get_raw_score(['data', 'science'], ['data'])
            0.6666666666666666
            >>> dice.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.5454545454545454
            >>> dice.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.5

        References:
            * Wikipedia article : https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient
            * SimMetrics library.
        """

        # validation is delegated to the shared helpers
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # shortcut: equal inputs
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # shortcut: an empty input
        if utils.sim_check_for_empty(set1, set2):
            return 0

        # normalise both inputs to real sets so duplicates do not count
        first = set1 if isinstance(set1, set) else set(set1)
        second = set2 if isinstance(set2, set) else set(set2)

        shared = float(len(first.intersection(second)))
        combined_size = float(len(first) + len(second))
        return 2.0 * shared / combined_size
Esempio n. 22
0
    def get_raw_score(self, set1, set2):
        """Computes the raw cosine score between two sets.

        The score is the size of the intersection divided by the product of the
        square roots of the two (de-duplicated) input sizes.

        Args:
            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.

        Returns:
            Cosine similarity (float). 1.0 when the inputs compare equal, 0.0 when the
            emptiness check succeeds.

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None
                (raised by the ``utils`` validation helpers).

        Examples:
            >>> cos = Cosine()
            >>> cos.get_raw_score(['data', 'science'], ['data'])
            0.7071067811865475
            >>> cos.get_raw_score(['data', 'data', 'science'], ['data', 'management'])
            0.4999999999999999
            >>> cos.get_raw_score([], ['data'])
            0.0

        References:
            * String similarity joins: An Experimental Evaluation (a paper appearing in the VLDB 2014 Conference).
            * Project Flamingo at http://flamingo.ics.uci.edu.
        """
        # input validations
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # if exact match return 1.0
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # if one of the inputs is empty return 0.0 (a float, as the docstring
        # example documents; the previous integer 0 did not match it)
        if utils.sim_check_for_empty(set1, set2):
            return 0.0

        if not isinstance(set1, set):
            set1 = set(set1)
        if not isinstance(set2, set):
            set2 = set(set2)

        # the two square roots are multiplied separately (rather than taking
        # the root of the product) so results stay bit-identical to the
        # documented ones
        return float(len(set1 & set2)) / (math.sqrt(float(len(set1))) *
                                          math.sqrt(float(len(set2))))
Esempio n. 23
0
    def get_raw_score(self, set1, set2):
        """Return the raw Jaccard score of two token collections.

        The score is the number of shared elements divided by the number of
        distinct elements across both inputs.

        Args:
            set1,set2 (set or list): Input sets (or lists). Anything that is not already a ``set`` is converted to one.

        Returns:
            Jaccard similarity score (float). 1.0 is returned when the inputs compare equal and the
            integer 0 when the emptiness check succeeds.

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None
                (raised by the ``utils`` validation helpers).

        Examples:
            >>> jac = Jaccard()
            >>> jac.get_raw_score(['data', 'science'], ['data'])
            0.5
            >>> jac.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.375
            >>> jac.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.3333333333333333
        """

        # validation is delegated to the shared helpers
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # shortcut: equal inputs
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # shortcut: an empty input
        if utils.sim_check_for_empty(set1, set2):
            return 0

        # normalise both inputs to real sets so duplicates do not count
        first = set1 if isinstance(set1, set) else set(set1)
        second = set2 if isinstance(set2, set) else set(set2)

        shared = float(len(first.intersection(second)))
        distinct = float(len(first.union(second)))
        return shared / distinct
Esempio n. 24
0
    def get_raw_score(self, set1, set2):
        """Score two token collections with the raw Jaccard measure.

        Jaccard is the size of the intersection over the size of the union of
        the two inputs, both taken as sets.

        Args:
            set1,set2 (set or list): Input sets (or lists). Non-``set`` inputs are turned into sets first.

        Returns:
            Jaccard similarity score (float). Equal inputs yield 1.0; the integer 0 is returned
            when the emptiness check succeeds.

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None
                (raised by the ``utils`` validation helpers).

        Examples:
            >>> jac = Jaccard()
            >>> jac.get_raw_score(['data', 'science'], ['data'])
            0.5
            >>> jac.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
            0.375
            >>> jac.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.3333333333333333
        """

        # reject None and unsupported container types up front
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # equal inputs need no further work
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0

        # neither does an empty input
        if utils.sim_check_for_empty(set1, set2):
            return 0

        # drop duplicates by working on sets
        left, right = set1, set2
        if not isinstance(left, set):
            left = set(left)
        if not isinstance(right, set):
            right = set(right)

        intersection_size = len(left & right)
        union_size = len(left | right)
        return float(intersection_size) / float(union_size)
Esempio n. 25
0
def dice(set1, set2):
    """
    Computes the Dice similarity coefficient between two sets.
    The coefficient is twice the size of the intersection divided by the sum of the two set sizes.
    For two sets X and Y, the Dice similarity coefficient is:
    :math:`dice(X, Y) = \\frac{2 * |X \\cap Y|}{|X| + |Y|}`
    Args:
        set1,set2 (set or list): Input sets (or lists). Anything that is not already a ``set`` is converted to one.
    Returns:
        Dice similarity coefficient (float). 1.0 is returned when the inputs compare equal and the
        integer 0 when the emptiness check succeeds.
    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the inputs is None
            (raised by the ``utils`` validation helpers).
    Examples:
        >>> dice(['data', 'science'], ['data'])
        0.6666666666666666
        >>> dice({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
        0.5454545454545454
        >>> dice(['data', 'management'], ['data', 'data', 'science'])
        0.5
    References:
        * Wikipedia article : https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient
        * Simmetrics library
    """
    # validation is delegated to the shared helpers
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)
    # shortcut: equal inputs
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0
    # shortcut: an empty input
    if utils.sim_check_for_empty(set1, set2):
        return 0
    # normalise both inputs to real sets so duplicates do not count
    first = set1 if isinstance(set1, set) else set(set1)
    second = set2 if isinstance(set2, set) else set(set2)
    shared = float(len(first.intersection(second)))
    combined_size = float(len(first) + len(second))
    return 2.0 * shared / combined_size
Esempio n. 26
0
def tfidf(bag1, bag2, corpus_list=None, dampen=False):
    """
    Compute tfidf measures between two lists given the corpus information.
    This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to find documents that
    are relevant to keyword queries.
    The intuition underlying the TF/IDF measure is that two strings are similar if they share distinguishing terms.

    The result is the cosine of the two TF/IDF weight vectors, restricted to the
    elements of ``bag1``/``bag2`` that occur in at least one corpus document.

    Args:
        bag1,bag2 (list): Input lists

        corpus_list (list of lists): Corpus list (default is set to None) of strings. If set to None,
            the input list are considered the only corpus.

        dampen (boolean): Flag to indicate whether 'log' should be applied to tf and idf measure.

    Returns:
        TF-IDF measure between the input lists (float). 1.0 is returned when the inputs compare
        equal and the integer 0 when the emptiness check succeeds.

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None
            (raised by the ``utils`` validation helpers).


    Examples:
        >>> tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']])
        0.17541160386140586
        >>> tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True)
        0.11166746710505392
        >>> tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']])
        0.5547001962252291
        >>> tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']])
        0.0
        >>> tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']], True)
        0.0
        >>> tfidf(['a', 'b', 'a'], ['a'])
        0.7071067811865475
    """
    # input validations
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)
    # if the inputs match exactly return 1.0
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0
    # if one of the inputs is empty return 0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0
    # if corpus is not provided treat the two inputs as the corpus
    if corpus_list is None:
        corpus_list = [bag1, bag2]
    corpus_size = len(corpus_list)
    # term frequency for the inputs
    tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
    # number of documents each relevant element appears in
    element_freq = {}
    # relevant elements that occur somewhere in the corpus
    total_unique_elements = set()
    for document in corpus_list:
        temp_set = set()
        for element in document:
            # Only elements of the two inputs matter. The Counters hold exactly
            # the elements of bag1/bag2, so a hash lookup replaces the linear
            # scan of the input lists that was done for every corpus token.
            if element in tf_x or element in tf_y:
                temp_set.add(element)
                total_unique_elements.add(element)
        # each document contributes at most once per element
        for element in temp_set:
            element_freq[element] = element_freq.get(element, 0) + 1
    v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0
    # tfidf calculation
    for element in total_unique_elements:
        idf_element = corpus_size * 1.0 / element_freq[element]
        if element not in tf_x:
            v_x = 0
        elif dampen:
            v_x = math.log(idf_element) * math.log(tf_x[element] + 1)
        else:
            v_x = idf_element * tf_x[element]
        if element not in tf_y:
            v_y = 0
        elif dampen:
            v_y = math.log(idf_element) * math.log(tf_y[element] + 1)
        else:
            v_y = idf_element * tf_y[element]
        v_x_y += v_x * v_y
        v_x_2 += v_x * v_x
        v_y_2 += v_y * v_y
    # a zero dot product also covers the case of a zero-length weight vector
    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
    def get_raw_score(self, bag1, bag2):
        """Return the raw TF/IDF score of two token lists.

        The score is the cosine of the two TF/IDF weight vectors. Document
        frequencies come from the corpus stored on the instance or, when no
        corpus was given, from the two inputs themselves (corpus size 2).
        Elements without a document frequency are ignored. When ``self.dampen``
        is set, ``log(idf) * log(tf + 1)`` is used instead of ``idf * tf``.

        Args:
            bag1,bag2 (list): Input lists.

        Returns:
            TF/IDF score between the input lists (float). 1.0 is returned when the inputs compare
            equal and the integer 0 when the emptiness check succeeds.

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None
                (raised by the ``utils`` validation helpers).

        Examples:
            
            >>> # here the corpus is a list of three strings that 
            >>> # have been tokenized into three lists of tokens
            >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']])
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['b', 'c'])
            0.7071067811865475
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']])
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], False)
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
            0.25298221281347033
            >>> tfidf = TfIdf(dampen=False)
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.7071067811865475
            >>> tfidf = TfIdf()
            >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
        """
        # validation is delegated to the shared helpers
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # shortcut: equal inputs
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0

        # shortcut: an empty input
        if utils.sim_check_for_empty(bag1, bag2):
            return 0

        # per-input term frequencies
        tf_x = collections.Counter(bag1)
        tf_y = collections.Counter(bag2)

        # document frequency over the two inputs: each input counts once
        # per distinct element
        local_df = {}
        for term_counts in (tf_x, tf_y):
            for element in term_counts:
                local_df[element] = local_df.get(element, 0) + 1

        # without a stored corpus the two inputs are the corpus
        if self.__corpus_list is None:
            curr_df, corpus_size = local_df, 2
        else:
            curr_df, corpus_size = self.__document_frequency, self.__corpus_size

        dot_product, norm_x_sq, norm_y_sq = 0.0, 0.0, 0.0

        for element in local_df:
            df_element = curr_df.get(element)
            # elements without a document frequency carry no weight
            if df_element is None:
                continue
            idf_element = corpus_size * 1.0 / df_element

            if element not in tf_x:
                weight_x = 0
            elif self.dampen:
                weight_x = log(idf_element) * log(tf_x[element] + 1)
            else:
                weight_x = idf_element * tf_x[element]

            if element not in tf_y:
                weight_y = 0
            elif self.dampen:
                weight_y = log(idf_element) * log(tf_y[element] + 1)
            else:
                weight_y = idf_element * tf_y[element]

            dot_product += weight_x * weight_y
            norm_x_sq += weight_x * weight_x
            norm_y_sq += weight_y * weight_y

        if dot_product == 0:
            return 0.0
        return dot_product / (sqrt(norm_x_sq) * sqrt(norm_y_sq))
Esempio n. 28
0
    def get_raw_score(self, bag1, bag2):
        """Computes the raw soft TF/IDF score between two lists given the corpus information.

        Every distinct term of ``bag1`` is paired with the term of ``bag2`` that
        ``self.sim_func`` scores highest, provided that score exceeds
        ``self.threshold``. The numerator sums the product of the paired terms'
        TF/IDF weights and their similarity; the denominator is the product of
        the norms of the two plain TF/IDF weight vectors. Document frequencies
        come from the corpus stored on the instance or, when no corpus was
        given, from the two inputs themselves (corpus size 2). When
        ``self.dampen`` is set, ``log(idf) * log(tf + 1)`` is used instead of
        ``idf * tf``.

        Args:
            bag1,bag2 (list): Input lists

        Returns:
            Soft TF/IDF score between the input lists (float). 1.0 is returned when the inputs
            compare equal, the integer 0 when the emptiness check succeeds, and 0.0 when either
            weight vector has zero length (for example when none of the terms of one input has
            a document frequency).

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None
                (raised by the ``utils`` validation helpers).

        Examples:
            >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=Jaro().get_raw_score, threshold=0.8)
            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
            0.17541160386140586
            >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.5547001962252291
            >>> soft_tfidf = SoftTfIdf([['x', 'y'], ['w'], ['q']])
            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> soft_tfidf = SoftTfIdf(sim_func=Affine().get_raw_score, threshold=0.6)
            >>> soft_tfidf.get_raw_score(['aa', 'bb', 'a'], ['ab', 'ba'])
            0.81649658092772592

        References:
            * the string matching chapter of the "Principles of Data Integration" book.
        """
        
        # input validations
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # if the inputs match exactly return 1.0
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0

        # if one of the inputs is empty return 0
        if utils.sim_check_for_empty(bag1, bag2):
            return 0

        # term frequency for the inputs
        tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
        
        # find unique elements in the input lists and their document frequency 
        local_df = {}
        for element in tf_x:
            local_df[element] = local_df.get(element, 0) + 1
        for element in tf_y:
            local_df[element] = local_df.get(element, 0) + 1

        # if corpus is not provided treat the two inputs as the corpus
        curr_df, corpus_size = (local_df, 2) if self.__corpus_list is None else (
                                   (self.__document_frequency, self.__corpus_size))

        # Force a float so every idf below is a true division even under
        # Python 2, where int / int truncates (the plain TF/IDF measure does
        # the same with ``corpus_size * 1.0``).
        corpus_size = corpus_size * 1.0

        def weight(idf, tf):
            # TF/IDF weight of one term, optionally log-dampened
            return log(idf) * log(tf + 1) if self.dampen else idf * tf

        # for every term of bag1 remember its best match in bag2 (and the
        # score), but only when the score exceeds the threshold
        similarity_map = {}
        for term_x in tf_x:
            max_score = 0.0
            for term_y in tf_y:
                score = self.sim_func(term_x, term_y)
                if score > self.threshold and score > max_score:
                    similarity_map[term_x] = (term_y, score)
                    max_score = score

        result, v_x_2, v_y_2 = 0.0, 0.0, 0.0
        # soft-tfidf calculation
        for element in local_df:
            df_element = curr_df.get(element)
            # elements without a document frequency are ignored
            if df_element is None:
                continue
            idf = corpus_size / df_element
            v_x = weight(idf, tf_x.get(element, 0))
            v_y = weight(idf, tf_y.get(element, 0))
            # denominator
            v_x_2 += v_x * v_x
            v_y_2 += v_y * v_y
            # numerator: weight of the element in bag1 times the weight of its
            # best match in bag2 (document frequency 1 is assumed for a match
            # without one), scaled by their similarity
            if element in similarity_map:
                matched_term, sim_score = similarity_map[element]
                idf_match = corpus_size / curr_df.get(matched_term, 1)
                v_match = weight(idf_match, tf_y.get(matched_term, 0))
                result += v_x * v_match * sim_score

        # A zero norm on either side means that input has no weighted term, so
        # the score is 0.0. Previously only v_x_2 was checked and a zero v_y_2
        # raised ZeroDivisionError.
        norm = sqrt(v_x_2) * sqrt(v_y_2)
        return 0.0 if norm == 0 else result / norm
Esempio n. 29
0
    def get_raw_score(self, string1, string2):
        """
        Computes the editex distance between two strings.

        As described on pages 3 & 4 of
        Zobel, Justin and Philip Dart. 1996. Phonetic string matching: Lessons from
        information retrieval. In: Proceedings of the ACM-SIGIR Conference on
        Research and Development in Information Retrieval, Zurich, Switzerland.
        166–173. http://goanna.cs.rmit.edu.au/~jz/fulltext/sigir96.pdf

        The local variant is based on
        Ring, Nicholas and Alexandra L. Uitdenbogerd. 2009. Finding ‘Lucy in
        Disguise’: The Misheard Lyric Matching Problem. In: Proceedings of the 5th
        Asia Information Retrieval Symposium, Sapporo, Japan. 157-167.
        http://www.seg.rmit.edu.au/research/download.php?manuscript=404

        Both strings are upper-cased and NFKD-normalized, then a dynamic-programming
        matrix is filled using the costs supplied by ``EditexHelper`` (built from
        ``self.match_cost``, ``self.mismatch_cost`` and ``self.group_cost``).
        When ``self.local`` is set, the first column of the matrix is left at zero.

        Args:
            string1,string2 (str): Input strings

        Returns:
            Editex distance (int). 0 is returned when the inputs compare equal; when the
            matrix is used, the value is an element of an integer NumPy array.

        Raises:
            TypeError : If the inputs are not strings
                (raised by the ``utils`` validation helpers).

        Examples:
            >>> ed = Editex()
            >>> ed.get_raw_score('cat', 'hat')
            2
            >>> ed.get_raw_score('Niall', 'Neil')
            2
            >>> ed.get_raw_score('aluminum', 'Catalan')
            12
            >>> ed.get_raw_score('ATCG', 'TAGC')
            6

        References:
            * Abydos Library - https://github.com/chrislit/abydos/blob/master/abydos/distance.py

        """
        # input validations
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)
        if utils.sim_check_for_exact_match(string1, string2):
            return 0

        # convert both the strings to NFKD normalized unicode
        string1 = unicodedata.normalize('NFKD', text_type(string1.upper()))
        string2 = unicodedata.normalize('NFKD', text_type(string2.upper()))

        # convert ß to SS (for Python2)
        string1 = string1.replace('ß', 'SS')
        string2 = string2.replace('ß', 'SS')

        len1 = len(string1)
        len2 = len(string2)

        # one side empty: every character of the other side costs a mismatch
        if len1 == 0:
            return len2 * self.mismatch_cost
        if len2 == 0:
            return len1 * self.mismatch_cost

        # The builtin ``int`` is what the ``np.int`` alias stood for; the alias
        # was deprecated in NumPy 1.20 and removed in 1.24.
        d_mat = np.zeros((len1 + 1, len2 + 1), dtype=int)

        # pad with a leading blank so that index i refers to the i-th character
        # and index i - 1 to its predecessor (the blank for the first one)
        string1 = ' ' + string1
        string2 = ' ' + string2
        editex_helper = EditexHelper(self.match_cost, self.mismatch_cost,
                                     self.group_cost)

        # first column: skipped for the local variant, which leaves it at zero
        if not self.local:
            for i in xrange(1, len1 + 1):
                d_mat[i, 0] = d_mat[i - 1, 0] + editex_helper.d_cost(
                                                    string1[i - 1], string1[i])

        # first row
        for j in xrange(1, len2 + 1):
            d_mat[0, j] = d_mat[0, j - 1] + editex_helper.d_cost(string2[j - 1],
                                                                 string2[j])

        # each cell is the cheapest of: stepping down (d_cost on string1),
        # stepping right (d_cost on string2), or the diagonal (r_cost between
        # the two current characters)
        for i in xrange(1, len1 + 1):
            for j in xrange(1, len2 + 1):
                d_mat[i, j] = min(d_mat[i - 1, j] + editex_helper.d_cost(
                                                    string1[i - 1], string1[i]),
                                  d_mat[i, j - 1] + editex_helper.d_cost(
                                                    string2[j - 1], string2[j]),
                                  d_mat[i - 1, j - 1] + editex_helper.r_cost(
                                                        string1[i], string2[j]))

        return d_mat[len1, len2]
Esempio n. 30
0
def tfidf(bag1, bag2, corpus_list=None, dampen=False):
    """
    Compute tfidf measures between two lists given the corpus information.
    This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to find documents that
    are relevant to keyword queries.
    The intuition underlying the TF/IDF measure is that two strings are similar if they share distinguishing terms.

    The result is the cosine of the two TF/IDF weight vectors, restricted to the
    elements of ``bag1``/``bag2`` that occur in at least one corpus document.

    Args:
        bag1,bag2 (list): Input lists

        corpus_list (list of lists): Corpus list (default is set to None) of strings. If set to None,
            the input list are considered the only corpus.

        dampen (boolean): Flag to indicate whether 'log' should be applied to tf and idf measure.

    Returns:
        TF-IDF measure between the input lists (float). 1.0 is returned when the inputs compare
        equal and the integer 0 when the emptiness check succeeds.

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None
            (raised by the ``utils`` validation helpers).


    Examples:
        >>> tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']])
        0.17541160386140586
        >>> tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True)
        0.11166746710505392
        >>> tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']])
        0.5547001962252291
        >>> tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']])
        0.0
        >>> tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']], True)
        0.0
        >>> tfidf(['a', 'b', 'a'], ['a'])
        0.7071067811865475
    """
    # input validations
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)
    # if the inputs match exactly return 1.0
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0
    # if one of the inputs is empty return 0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0
    # if corpus is not provided treat the two inputs as the corpus
    if corpus_list is None:
        corpus_list = [bag1, bag2]
    corpus_size = len(corpus_list)
    # term frequency for the inputs
    tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
    # number of documents each relevant element appears in
    element_freq = {}
    # relevant elements that occur somewhere in the corpus
    total_unique_elements = set()
    for document in corpus_list:
        temp_set = set()
        for element in document:
            # Only elements of the two inputs matter. The Counters hold exactly
            # the elements of bag1/bag2, so a hash lookup replaces the linear
            # scan of the input lists that was done for every corpus token.
            if element in tf_x or element in tf_y:
                temp_set.add(element)
                total_unique_elements.add(element)
        # each document contributes at most once per element
        for element in temp_set:
            element_freq[element] = element_freq.get(element, 0) + 1
    v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0
    # tfidf calculation
    for element in total_unique_elements:
        idf_element = corpus_size * 1.0 / element_freq[element]
        if element not in tf_x:
            v_x = 0
        elif dampen:
            v_x = math.log(idf_element) * math.log(tf_x[element] + 1)
        else:
            v_x = idf_element * tf_x[element]
        if element not in tf_y:
            v_y = 0
        elif dampen:
            v_y = math.log(idf_element) * math.log(tf_y[element] + 1)
        else:
            v_y = idf_element * tf_y[element]
        v_x_y += v_x * v_y
        v_x_2 += v_x * v_x
        v_y_2 += v_y * v_y
    # a zero dot product also covers the case of a zero-length weight vector
    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
Esempio n. 31
0
    def get_raw_score(self, string1, string2):
        """
        Computes the Soundex phonetic similarity between two strings.

        Phonetic measures such as Soundex match strings by how they sound. They are
        particularly useful for names, which are often spelled differently while
        sounding alike (Meyer, Meier and Mire; Smith, Smithe and Smythe).

        Soundex is used primarily to match surnames. It does not work as well for names
        of East Asian origins, because much of the discriminating power of these names
        resides in the vowel sounds, which the code ignores.

        Each string is stripped of non-alphanumeric characters and upper-cased. Its
        first character is kept; in the remainder the letters A, E, I, O, U, Y, W
        and H are dropped and the remaining consonants replaced by digit classes.
        The code is the first character plus at most three characters of that
        remainder, and the two codes are compared for equality.

        Args:
            string1,string2 (str): Input strings

        Returns:
            Soundex similarity score (int): 1 if the cleaned strings are equal or
            their codes match, 0 otherwise

        Raises:
            TypeError : If the inputs are not strings
                (raised by the ``utils`` validation helpers).

        Examples:
            >>> s = Soundex()
            >>> s.get_raw_score('Robert', 'Rupert')
            1
            >>> s.get_raw_score('Sue', 's')
            1
            >>> s.get_raw_score('Gough', 'Goff')
            0
            >>> s.get_raw_score('a,,li', 'ali')
            1

        """
        # validation is delegated to the shared helpers
        utils.sim_check_for_none(string1, string2)
        utils.sim_check_for_string_inputs(string1, string2)

        # keep alphanumeric characters only
        non_alphanumeric = "[^a-zA-Z0-9]"
        string1 = re.sub(non_alphanumeric, "", string1)
        string2 = re.sub(non_alphanumeric, "", string2)

        utils.sim_check_for_zero_len(string1, string2)

        if utils.sim_check_for_exact_match(string1, string2):
            return 1

        # applied in order to everything after the first character:
        # drop vowels, 'Y', 'W' and 'H', then map
        # (B,F,P,V)->1 (C,G,J,K,Q,S,X,Z)->2 (D,T)->3 (L)->4 (M,N)->5 (R)->6
        substitutions = (
            ('[AEIOUYWH]', ''),
            ('[BFPV]', '1'),
            ('[CGJKQSXZ]', '2'),
            ('[DT]', '3'),
            ('[L]', '4'),
            ('[MN]', '5'),
            ('[R]', '6'),
        )

        def encode(word):
            # build the code: first character + up to three encoded characters
            word = word.upper()
            initial, remainder = word[0], word[1:]
            for pattern, replacement in substitutions:
                remainder = re.sub(pattern, replacement, remainder)
            return initial + remainder[:3]

        if encode(string1) == encode(string2):
            return 1
        return 0
Esempio n. 32
0
    def get_raw_score(self, bag1, bag2):
        """Computes the raw soft TF/IDF score between two lists given the corpus information.

        Every distinct term of ``bag1`` is paired with the term of ``bag2`` that
        ``self.sim_func`` scores highest, provided that score exceeds
        ``self.threshold``. The numerator sums the product of the paired terms'
        TF/IDF weights (``idf * tf``) and their similarity; the denominator is
        the product of the norms of the two plain TF/IDF weight vectors.
        Document frequencies come from the corpus stored on the instance or,
        when no corpus was given, from the two inputs themselves (corpus size 2).

        Args:
            bag1,bag2 (list): Input lists

        Returns:
            Soft TF/IDF score between the input lists (float). 1.0 is returned when the inputs
            compare equal, the integer 0 when the emptiness check succeeds, and 0.0 when either
            weight vector has zero length (for example when none of the terms of one input has
            a document frequency).

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None
                (raised by the ``utils`` validation helpers).

        Examples:
            >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=Jaro().get_raw_score, threshold=0.8)
            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
            0.17541160386140586
            >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.5547001962252291
            >>> soft_tfidf = SoftTfIdf([['x', 'y'], ['w'], ['q']])
            >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
            0.0
            >>> soft_tfidf = SoftTfIdf(sim_func=Affine().get_raw_score, threshold=0.6)
            >>> soft_tfidf.get_raw_score(['aa', 'bb', 'a'], ['ab', 'ba'])
            0.81649658092772592

        References:
            * the string matching chapter of the "Principles of Data Integration" book.
        """
        
        # input validations
        utils.sim_check_for_none(bag1, bag2)
        utils.sim_check_for_list_or_set_inputs(bag1, bag2)

        # if the inputs match exactly return 1.0
        if utils.sim_check_for_exact_match(bag1, bag2):
            return 1.0

        # if one of the inputs is empty return 0
        if utils.sim_check_for_empty(bag1, bag2):
            return 0

        # term frequency for the inputs
        tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
        
        # find unique elements in the input lists and their document frequency 
        local_df = {}
        for element in tf_x:
            local_df[element] = local_df.get(element, 0) + 1
        for element in tf_y:
            local_df[element] = local_df.get(element, 0) + 1

        # if corpus is not provided treat the two inputs as the corpus
        curr_df, corpus_size = (local_df, 2) if self.__corpus_list is None else (
                                   (self.__document_frequency, self.__corpus_size))

        # Force a float so every idf below is a true division even under
        # Python 2, where int / int truncates (the plain TF/IDF measure does
        # the same with ``corpus_size * 1.0``).
        corpus_size = corpus_size * 1.0

        # for every term of bag1 remember its best match in bag2 (and the
        # score), but only when the score exceeds the threshold
        similarity_map = {}
        for term_x in tf_x:
            max_score = 0.0
            for term_y in tf_y:
                score = self.sim_func(term_x, term_y)
                if score > self.threshold and score > max_score:
                    similarity_map[term_x] = (term_y, score)
                    max_score = score

        result, v_x_2, v_y_2 = 0.0, 0.0, 0.0
        # soft-tfidf calculation
        for element in local_df:
            df_element = curr_df.get(element)
            # elements without a document frequency are ignored
            if df_element is None:
                continue
            idf = corpus_size / df_element
            v_x = idf * tf_x.get(element, 0)
            v_y = idf * tf_y.get(element, 0)
            # denominator
            v_x_2 += v_x * v_x
            v_y_2 += v_y * v_y
            # numerator: weight of the element in bag1 times the weight of its
            # best match in bag2 (document frequency 1 is assumed for a match
            # without one), scaled by their similarity
            if element in similarity_map:
                matched_term, sim_score = similarity_map[element]
                idf_match = corpus_size / curr_df.get(matched_term, 1)
                v_match = idf_match * tf_y.get(matched_term, 0)
                result += v_x * v_match * sim_score

        # A zero norm on either side means that input has no weighted term, so
        # the score is 0.0. Previously only v_x_2 was checked and a zero v_y_2
        # raised ZeroDivisionError.
        norm = sqrt(v_x_2) * sqrt(v_y_2)
        return 0.0 if norm == 0 else result / norm
Esempio n. 33
0
def generalized_jaccard(set1, set2, sim_func=jaro, threshold=0.5):
    """
    Computes the Generalized Jaccard measure between two sets.

    A softened Jaccard measure: instead of requiring tokens to match exactly,
    token pairs are scored with a secondary similarity function, so that
    misspelled tokens (e.g. energy vs. eneryg) can still be matched.

    Args:
        set1,set2 (set or list): Input sets (or lists) of strings. Input lists are converted to sets.
        sim_func (func): Similarity function returning a score in [0,1] for a pair of
            strings (optional), default is the jaro similarity measure.
        threshold (float): Threshold value (defaults to 0.5). A token pair is considered
            a match only if its similarity is strictly greater than the threshold.

    Returns:
        Generalized Jaccard similarity (float)

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
        ValueError : If the similarity measure doesn't return values in the range [0,1]

    Examples:
        >>> generalized_jaccard(['data', 'science'], ['data'])
        0.5
        >>> generalized_jaccard(['data', 'management'], ['data', 'data', 'science'])
        0.3333333333333333
        >>> generalized_jaccard(['Niall'], ['Neal', 'Njall'])
        0.43333333333333335
        >>> generalized_jaccard(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'],
        ...                     ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'],
        ...                     sim_func=jaro_winkler, threshold=0.8)
        0.45810185185185187
    """
    # validate the inputs before doing any work
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # trivial outcomes: identical inputs score 1.0, an empty input scores 0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # work on sets so duplicate tokens are counted once
    tokens1 = set1 if isinstance(set1, set) else set(set1)
    tokens2 = set2 if isinstance(set2, set) else set(set2)

    # score every cross pair, keeping those strictly above the threshold
    candidates = []
    for left in tokens1:
        for right in tokens2:
            pair_score = sim_func(left, right)
            if pair_score < 0 or pair_score > 1:
                raise ValueError('Similarity measure should return value in the range [0,1]')
            if pair_score > threshold:
                candidates.append(utils.Similarity(left, right, pair_score))

    # best-scoring pairs first
    candidates = sorted(candidates, key=lambda pair: pair.similarity_score, reverse=True)

    # greedy one-to-one matching: a token from either side is used at most once
    used_left = set()
    used_right = set()
    total_score = 0.0
    for pair in candidates:
        if pair.first_string in used_left or pair.second_string in used_right:
            continue
        used_left.add(pair.first_string)
        used_right.add(pair.second_string)
        total_score += pair.similarity_score

    # every accepted pair adds exactly one token to used_left
    matched = len(used_left)
    return float(total_score) / float(len(tokens1) + len(tokens2) - matched)
Esempio n. 34
0
def soundex(string1, string2):
    """
    Computes the Soundex phonetic similarity between two strings.

    Phonetic measure such as soundex match string based on their sound. These
    measures have been especially effective in matching names, since names are
    often spelled in different ways that sound the same. For example, Meyer, Meier,
    and Mire sound the same, as do Smith, Smithe, and Smythe.

    Soundex is used primarily to match surnames. It does not work as well for names
    of East Asian origins, because much of the discriminating power of these names
    resides in the vowel sounds, which the code ignores.

    Each string is encoded as its upper-cased first character followed by at
    most three digits derived from the remaining characters; the codes are
    compared as-is (no zero padding, adjacent equal digits are not collapsed).

    Args:
        string1,string2 (str): Input strings

    Returns:
        Soundex similarity score (int) is returned: 1 if the codes are equal, 0 otherwise

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> soundex('Robert', 'Rupert')
        1
        >>> soundex('Sue', 's')
        1
        >>> soundex('Gough', 'Goff')
        0
        >>> soundex('a,,li', 'ali')
        1

    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 1
    utils.sim_check_for_zero_len(string1, string2)

    # (B,F,P,V)->1 (C,G,J,K,Q,S,X,Z)->2 (D,T)->3 (L)->4 (M,N)->5 (R)->6
    digit_classes = (
        ('[BFPV]', '1'),
        ('[CGJKQSXZ]', '2'),
        ('[DT]', '3'),
        ('[L]', '4'),
        ('[MN]', '5'),
        ('[R]', '6'),
    )

    codes = []
    for string in (string1.upper(), string2.upper()):
        first_letter, rest = string[0], string[1:]
        # remove occurrences of vowels, 'y', 'w' and 'h'
        rest = re.sub('[AEIOUYWH]', '', rest)
        for letters, digit in digit_classes:
            rest = re.sub(letters, digit, rest)
        # remove all chars but digits; raw string avoids the invalid '\D'
        # escape warning raised for non-raw literals on modern Python
        rest = re.sub(r'\D', '', rest)
        codes.append(first_letter + rest[:3])

    return 1 if codes[0] == codes[1] else 0
Esempio n. 35
0
def editex(string1, string2, match_cost=0, group_cost=1, mismatch_cost=2, local=False):
    """
    Computes the editex distance between two strings.

    As described on pages 3 & 4 of
    Zobel, Justin and Philip Dart. 1996. Phonetic string matching: Lessons from
    information retrieval. In: Proceedings of the ACM-SIGIR Conference on
    Research and Development in Information Retrieval, Zurich, Switzerland.
    166–173. http://goanna.cs.rmit.edu.au/~jz/fulltext/sigir96.pdf

    The local variant is based on
    Ring, Nicholas and Alexandra L. Uitdenbogerd. 2009. Finding ‘Lucy in
    Disguise’: The Misheard Lyric Matching Problem. In: Proceedings of the 5th
    Asia Information Retrieval Symposium, Sapporo, Japan. 157-167.
    http://www.seg.rmit.edu.au/research/download.php?manuscript=404

    Args:
        string1,string2 (str): Input strings
        match_cost (int): Weight to give the correct char match, default=0
        group_cost (int): Weight to give if the chars are in the same editex group, default=1
        mismatch_cost (int): Weight to give the incorrect char match, default=2
        local (boolean): Local variant on/off, default=False

    Returns:
        Editex distance (int)

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> editex('cat', 'hat')
        2
        >>> editex('Niall', 'Neil')
        2
        >>> editex('aluminum', 'Catalan')
        12
        >>> editex('ATCG', 'TAGC')
        6

    References:
        * Abydos Library - https://github.com/chrislit/abydos/blob/master/abydos/distance.py
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 0
    # convert both the strings to NFKD normalized unicode
    string1 = unicodedata.normalize('NFKD', _unicode(string1.upper()))
    string2 = unicodedata.normalize('NFKD', _unicode(string2.upper()))
    # convert ß to SS (for Python2)
    string1 = string1.replace('ß', 'SS')
    string2 = string2.replace('ß', 'SS')

    # the strings may have become equal (or empty) only after normalization
    if string1 == string2:
        return 0
    if len(string1) == 0:
        return len(string2) * mismatch_cost
    if len(string2) == 0:
        return len(string1) * mismatch_cost

    # builtin int replaces the np.int alias, which was removed in NumPy 1.24
    d_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=int)
    len1 = len(string1)
    len2 = len(string2)
    # pad with a leading blank so index i refers to the i-th character and
    # index i - 1 to its predecessor (a blank for the first character)
    string1 = ' ' + string1
    string2 = ' ' + string2
    editex_helper = utils.Editex(match_cost, mismatch_cost, group_cost)
    # the local variant leaves the first column at zero
    if not local:
        for i in _range(1, len1 + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + editex_helper.d_cost(string1[i - 1], string1[i])
    for j in _range(1, len2 + 1):
        d_mat[0, j] = d_mat[0, j - 1] + editex_helper.d_cost(string2[j - 1], string2[j])

    # each cell takes the cheapest of: step down (string1 char), step right
    # (string2 char), or the diagonal (pair the two current chars)
    for i in _range(1, len1 + 1):
        for j in _range(1, len2 + 1):
            d_mat[i, j] = min(d_mat[i - 1, j] + editex_helper.d_cost(string1[i - 1], string1[i]),
                              d_mat[i, j - 1] + editex_helper.d_cost(string2[j - 1], string2[j]),
                              d_mat[i - 1, j - 1] + editex_helper.r_cost(string1[i], string2[j]))

    return d_mat[len1, len2]
Esempio n. 36
0
def soft_tfidf(bag1, bag2, corpus_list=None, sim_func=jaro, threshold=0.5):
    """
    Compute Soft-tfidf measures between two lists given the corpus information.

    Args:
        bag1,bag2 (list): Input lists

        corpus_list (list of lists): Corpus list (default is set to None) of strings. If set to None,
            the input list are considered the only corpus

        sim_func (func): Secondary similarity function. This should return a similarity score between two strings (optional),
            default is jaro similarity measure

        threshold (float): Threshold value for the secondary similarity function (defaults to 0.5). If the similarity
            of a token pair exceeds the threshold, then the token pair is considered a match.

    Returns:
        Soft TF-IDF measure between the input lists. 0.0 is returned when either
        input has no token that occurs in the corpus (zero-length tf-idf vector).

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None.

    Examples:
        >>> soft_tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=jaro, threshold=0.8)
        0.17541160386140586
        >>> soft_tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
        0.5547001962252291
        >>> soft_tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']])
        0.0
        >>> soft_tfidf(['aa', 'bb', 'a'], ['ab', 'ba'], sim_func=affine, threshold=0.6)
        0.81649658092772592

    References:
        * Principles of Data Integration book
    """
    # input validations
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)
    # if the strings match exactly return 1.0
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0
    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0
    # if corpus is not provided treat input string as corpus
    if corpus_list is None:
        corpus_list = [bag1, bag2]
    corpus_size = len(corpus_list) * 1.0
    # term frequency for input strings
    tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
    # number of documents an element appeared
    element_freq = {}
    # set of unique element
    total_unique_elements = set()
    for document in corpus_list:
        temp_set = set()
        for element in document:
            # adding element only if it is present in one of two input string;
            # the Counters give O(1) membership instead of scanning the lists
            if element in tf_x or element in tf_y:
                temp_set.add(element)
                total_unique_elements.add(element)
        # update element document frequency for this document
        for element in temp_set:
            element_freq[element] = element_freq.get(element, 0) + 1
    similarity_map = {}
    # calculating the term sim score against the input string 2, construct similarity map;
    # iterating the Counter keys visits each distinct token once
    for x in tf_x:
        max_score = 0.0
        for y in tf_y:
            score = sim_func(x, y)
            # adding sim only if it is above threshold and highest for this element
            if score > threshold and score > max_score:
                similarity_map[x] = utils.Similarity(x, y, score)
                max_score = score
    result, v_x_2, v_y_2 = 0.0, 0.0, 0.0
    # soft-tfidf calculation
    for element in total_unique_elements:
        # numerator
        if element in similarity_map:
            sim = similarity_map[element]
            # a token unseen in the corpus gets idf == corpus_size
            idf_first = corpus_size / element_freq.get(sim.first_string, 1)
            idf_second = corpus_size / element_freq.get(sim.second_string, 1)
            # Counter lookups yield 0 for absent tokens
            v_x = idf_first * tf_x[sim.first_string]
            v_y = idf_second * tf_y[sim.second_string]
            result += v_x * v_y * sim.similarity_score
        # denominator
        idf = corpus_size / element_freq[element]
        v_x = idf * tf_x[element]
        v_x_2 += v_x * v_x
        v_y = idf * tf_y[element]
        v_y_2 += v_y * v_y
    # guard both norms: previously only v_x_2 was checked, so a bag2 with no
    # corpus tokens raised ZeroDivisionError
    norm_product = math.sqrt(v_x_2) * math.sqrt(v_y_2)
    return result / norm_product if norm_product else 0.0
    def get_raw_score(self, set1, set2):
        """
        Computes the Generalized Jaccard measure between two sets.

        A softened Jaccard measure: token pairs are scored with the instance's
        secondary similarity function (``self.sim_func``) so that misspelled
        tokens, such as energy vs. eneryg, can still be matched. A pair counts
        as a match only if its score is strictly greater than ``self.threshold``.

        Args:
            set1,set2 (set or list): Input sets (or lists) of strings. Input lists are converted to sets.

        Returns:
            Generalized Jaccard similarity (float)

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
            ValueError : If the similarity measure doesn't return values in the range [0,1]

        Examples:
            >>> gj = GeneralizedJaccard()
            >>> gj.get_raw_score(['data', 'science'], ['data'])
            0.5
            >>> gj.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.3333333333333333
            >>> gj.get_raw_score(['Niall'], ['Neal', 'Njall'])
            0.43333333333333335
            >>> gj = GeneralizedJaccard(sim_func=JaroWinkler().get_raw_score, threshold=0.8)
            >>> gj.get_raw_score(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'],
            ...                  ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            0.45810185185185187
        """
        # validate the inputs before doing any work
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # trivial outcomes: identical inputs score 1.0, an empty input scores 0
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0
        if utils.sim_check_for_empty(set1, set2):
            return 0

        # work on sets so duplicate tokens are counted once
        tokens1 = set1 if isinstance(set1, set) else set(set1)
        tokens2 = set2 if isinstance(set2, set) else set(set2)

        # collect (left, right, score) for every pair strictly above the threshold
        scored_pairs = []
        for left in tokens1:
            for right in tokens2:
                pair_score = self.sim_func(left, right)
                if pair_score < 0 or pair_score > 1:
                    raise ValueError('Similarity measure should'
                                     ' return value in the range [0,1]')
                if pair_score > self.threshold:
                    scored_pairs.append((left, right, pair_score))

        # best-scoring pairs first
        scored_pairs = sorted(scored_pairs, key=lambda pair: pair[2], reverse=True)

        # greedy one-to-one matching: a token from either side is used at most once
        used_left = set()
        used_right = set()
        total_score = 0.0
        for left, right, pair_score in scored_pairs:
            if left in used_left or right in used_right:
                continue
            used_left.add(left)
            used_right.add(right)
            total_score += pair_score

        # every accepted pair adds exactly one token to used_left
        matched = len(used_left)
        return float(total_score) / float(len(tokens1) + len(tokens2) - matched)
def levenshtein(string1, string2):
    """
    Computes the Levenshtein distance between two strings.

    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
    is carried out using a sequence of the following operators: delete a character, insert a character, and
    substitute one character for another. Every operator has unit cost.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Levenshtein distance (int). Note that an exact match short-circuits
        and returns the float 0.0.

    Raises:
        TypeError : If the inputs are not strings

    Examples:
        >>> levenshtein('a', '')
        1
        >>> levenshtein('example', 'samples')
        3
        >>> levenshtein('levenshtein', 'frankenstein')
        6

    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)
    if utils.sim_check_for_exact_match(string1, string2):
        return 0.0

    ins_cost, del_cost, sub_cost = (1, 1, 1)

    len_str1 = len(string1)
    len_str2 = len(string2)

    # an empty side means the distance is pure insertions or pure deletions
    if len_str1 == 0:
        return len_str2 * ins_cost

    if len_str2 == 0:
        return len_str1 * del_cost

    # builtin int replaces the np.int alias, which was removed in NumPy 1.24
    d_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=int)

    # first column / row: cost of deleting or inserting a whole prefix
    for i in _range(len_str1 + 1):
        d_mat[i, 0] = i * del_cost

    for j in _range(len_str2 + 1):
        d_mat[0, j] = j * ins_cost

    # d_mat[i + 1, j + 1] is the distance between string1[:i + 1] and string2[:j + 1]
    for i in _range(len_str1):
        for j in _range(len_str2):
            d_mat[i + 1, j + 1] = min(
                d_mat[i + 1, j] + ins_cost,
                d_mat[i, j + 1] + del_cost,
                d_mat[i, j] + (sub_cost if string1[i] != string2[j] else 0)
            )

    return d_mat[len_str1, len_str2]
Esempio n. 39
0
def soft_tfidf(bag1, bag2, corpus_list=None, sim_func=jaro, threshold=0.5):
    """
    Compute Soft-tfidf measures between two lists given the corpus information.

    Args:
        bag1,bag2 (list): Input lists

        corpus_list (list of lists): Corpus list (default is set to None) of strings. If set to None,
            the input list are considered the only corpus

        sim_func (func): Secondary similarity function. This should return a similarity score between two strings (optional),
            default is jaro similarity measure

        threshold (float): Threshold value for the secondary similarity function (defaults to 0.5). If the similarity
            of a token pair exceeds the threshold, then the token pair is considered a match.

    Returns:
        Soft TF-IDF measure between the input lists. 0.0 is returned when either
        input has no token that occurs in the corpus (zero-length tf-idf vector).

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None.

    Examples:
        >>> soft_tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=jaro, threshold=0.8)
        0.17541160386140586
        >>> soft_tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
        0.5547001962252291
        >>> soft_tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']])
        0.0
        >>> soft_tfidf(['aa', 'bb', 'a'], ['ab', 'ba'], sim_func=affine, threshold=0.6)
        0.81649658092772592

    References:
        * Principles of Data Integration book
    """
    # input validations
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)
    # if the strings match exactly return 1.0
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0
    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0
    # if corpus is not provided treat input string as corpus
    if corpus_list is None:
        corpus_list = [bag1, bag2]
    corpus_size = len(corpus_list) * 1.0
    # term frequency for input strings
    tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
    # number of documents an element appeared
    element_freq = {}
    # set of unique element
    total_unique_elements = set()
    for document in corpus_list:
        temp_set = set()
        for element in document:
            # adding element only if it is present in one of two input string;
            # the Counters give O(1) membership instead of scanning the lists
            if element in tf_x or element in tf_y:
                temp_set.add(element)
                total_unique_elements.add(element)
        # update element document frequency for this document
        for element in temp_set:
            element_freq[element] = element_freq.get(element, 0) + 1
    similarity_map = {}
    # calculating the term sim score against the input string 2, construct similarity map;
    # iterating the Counter keys visits each distinct token once
    for x in tf_x:
        max_score = 0.0
        for y in tf_y:
            score = sim_func(x, y)
            # adding sim only if it is above threshold and highest for this element
            if score > threshold and score > max_score:
                similarity_map[x] = utils.Similarity(x, y, score)
                max_score = score
    result, v_x_2, v_y_2 = 0.0, 0.0, 0.0
    # soft-tfidf calculation
    for element in total_unique_elements:
        # numerator
        if element in similarity_map:
            sim = similarity_map[element]
            # a token unseen in the corpus gets idf == corpus_size
            idf_first = corpus_size / element_freq.get(sim.first_string, 1)
            idf_second = corpus_size / element_freq.get(sim.second_string, 1)
            # Counter lookups yield 0 for absent tokens
            v_x = idf_first * tf_x[sim.first_string]
            v_y = idf_second * tf_y[sim.second_string]
            result += v_x * v_y * sim.similarity_score
        # denominator
        idf = corpus_size / element_freq[element]
        v_x = idf * tf_x[element]
        v_x_2 += v_x * v_x
        v_y = idf * tf_y[element]
        v_y_2 += v_y * v_y
    # guard both norms: previously only v_x_2 was checked, so a bag2 with no
    # corpus tokens raised ZeroDivisionError
    norm_product = math.sqrt(v_x_2) * math.sqrt(v_y_2)
    return result / norm_product if norm_product else 0.0
    def get_raw_score(self, set1, set2):
        """
        Computes the Generalized Jaccard measure between two sets.

        A softened Jaccard measure: token pairs are scored with the instance's
        secondary similarity function (``self.sim_func``) so that misspelled
        tokens, such as energy vs. eneryg, can still be matched. A pair counts
        as a match only if its score is strictly greater than ``self.threshold``.

        Args:
            set1,set2 (set or list): Input sets (or lists) of strings. Input lists are converted to sets.

        Returns:
            Generalized Jaccard similarity (float)

        Raises:
            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
            ValueError : If the similarity measure doesn't return values in the range [0,1]

        Examples:
            >>> gj = GeneralizedJaccard()
            >>> gj.get_raw_score(['data', 'science'], ['data'])
            0.5
            >>> gj.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
            0.3333333333333333
            >>> gj.get_raw_score(['Niall'], ['Neal', 'Njall'])
            0.43333333333333335
            >>> gj = GeneralizedJaccard(sim_func=JaroWinkler().get_raw_score, threshold=0.8)
            >>> gj.get_raw_score(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'],
            ...                  ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            0.45810185185185187
        """
        # validate the inputs before doing any work
        utils.sim_check_for_none(set1, set2)
        utils.sim_check_for_list_or_set_inputs(set1, set2)

        # trivial outcomes: identical inputs score 1.0, an empty input scores 0
        if utils.sim_check_for_exact_match(set1, set2):
            return 1.0
        if utils.sim_check_for_empty(set1, set2):
            return 0

        # work on sets so duplicate tokens are counted once
        tokens1 = set1 if isinstance(set1, set) else set(set1)
        tokens2 = set2 if isinstance(set2, set) else set(set2)

        # collect (left, right, score) for every pair strictly above the threshold
        scored_pairs = []
        for left in tokens1:
            for right in tokens2:
                pair_score = self.sim_func(left, right)
                if pair_score < 0 or pair_score > 1:
                    raise ValueError('Similarity measure should'
                                     ' return value in the range [0,1]')
                if pair_score > self.threshold:
                    scored_pairs.append((left, right, pair_score))

        # best-scoring pairs first
        scored_pairs = sorted(scored_pairs, key=lambda pair: pair[2], reverse=True)

        # greedy one-to-one matching: a token from either side is used at most once
        used_left = set()
        used_right = set()
        total_score = 0.0
        for left, right, pair_score in scored_pairs:
            if left in used_left or right in used_right:
                continue
            used_left.add(left)
            used_right.add(right)
            total_score += pair_score

        # every accepted pair adds exactly one token to used_left
        matched = len(used_left)
        return float(total_score) / float(len(tokens1) + len(tokens2) - matched)