def spearman_rho(worder, normalize=True):
    """
    Calculates the Spearman's Rho correlation coefficient given the *worder*
    list of word alignment from word_rank_alignment(), using the formula:

        rho = 1 - sum(d**2) / choose(len(worder)+1, 3)

    Given that d is the sum of difference between the *worder* list of indices
    and the original word indices from the reference sentence.

    Using the (H0,R0) and (H5, R5) example from the paper

        >>> worder =  [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        >>> round(spearman_rho(worder, normalize=False), 3)
        -0.591
        >>> round(spearman_rho(worder), 3)
        0.205

    :param worder: The worder list output from word_rank_alignment
    :param type: list(int)
    """
    worder_len = len(worder)
    sum_d_square = sum((wi - i)**2 for wi, i in zip(worder, range(worder_len)))
    rho = 1 - sum_d_square / choose(worder_len + 1, 3)

    if normalize:  # If normalized, the rho output falls between 0.0 to 1.0
        return (rho + 1) / 2
    else:  # Otherwise, the rho outputs falls between -1.0 to +1.0
        return rho
Example #2
0
def spearman_rho(worder, normalize=True):
    """
    Calculates the Spearman's Rho correlation coefficient given the *worder* 
    list of word alignment from word_rank_alignment(), using the formula:
    
        rho = 1 - sum(d**2) / choose(len(worder)+1, 3)  
        
    Given that d is the sum of difference between the *worder* list of indices
    and the original word indices from the reference sentence.
    
    Using the (H0,R0) and (H5, R5) example from the paper
    
        >>> worder =  [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        >>> round(spearman_rho(worder, normalize=False), 3)
        -0.591
        >>> round(spearman_rho(worder), 3)
        0.205
    
    :param worder: The worder list output from word_rank_alignment
    :param type: list(int)
    """
    worder_len = len(worder)
    sum_d_square = sum((wi - i)**2 for wi, i in zip(worder, range(worder_len)))
    rho = 1 - sum_d_square / choose(worder_len+1, 3)
    
    if normalize: # If normalized, the rho output falls between 0.0 to 1.0
        return (rho + 1) /2
    else: # Otherwise, the rho outputs falls between -1.0 to +1.0
        return rho
Example #3
0
def kendall_tau(worder, normalize=True):
    """
    Calculates the Kendall's Tau correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:

        tau = 2 * num_increasing_pairs / num_possible_pairs -1

    Note that the no. of increasing pairs can be discontinuous in the *worder*
    list and each each increasing sequence can be tabulated as choose(len(seq), 2)
    no. of increasing pairs, e.g.

        >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        >>> number_possible_pairs = choose(len(worder), 2)
        >>> round(kendall_tau(worder, normalize=False),3)
        -0.236
        >>> round(kendall_tau(worder),3)
        0.382

    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization to between 0.0 and 1.0.
    :type normalize: boolean
    :return: The Kendall's Tau correlation coefficient.
    :rtype: float
    """
    worder_len = len(worder)
    # With worder_len < 2, `choose(worder_len, 2)` will be 0.
    # As we divide by this, it will give a ZeroDivisionError.
    # To avoid this, we can just return the lowest possible score.
    if worder_len < 2:
        tau = -1
    else:
        # Extract the groups of increasing/monotonic sequences.
        increasing_sequences = find_increasing_sequences(worder)
        # Calculate no. of increasing_pairs in *worder* list.
        num_increasing_pairs = sum(
            choose(len(seq), 2) for seq in increasing_sequences)
        # Calculate no. of possible pairs.
        num_possible_pairs = choose(worder_len, 2)
        # Kendall's Tau computation.
        tau = 2 * num_increasing_pairs / num_possible_pairs - 1
    if normalize:  # If normalized, the tau output falls between 0.0 to 1.0
        return (tau + 1) / 2
    else:  # Otherwise, the tau outputs falls between -1.0 to +1.0
        return tau
Example #4
0
def kendall_tau(worder, normalize=True):
    """
    Calculates the Kendall's Tau correlation coefficient given the *worder*
    list of word alignments from word_rank_alignment(), using the formula:
    
        tau = 2 * num_increasing_pairs / num_possible pairs -1
    
    Note that the no. of increasing pairs can be discontinuous in the *worder*
    list and each each increasing sequence can be tabulated as choose(len(seq), 2) 
    no. of increasing pairs, e.g.
    
        >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
        >>> number_possible_pairs = choose(len(worder), 2)
        >>> round(kendall_tau(worder, normalize=False),3)
        -0.236
        >>> round(kendall_tau(worder),3)
        0.382
    
    :param worder: The worder list output from word_rank_alignment
    :type worder: list(int)
    :param normalize: Flag to indicate normalization
    :type normalize: boolean
    :return: The Kendall's Tau correlation coefficient.
    :rtype: float
    """
    worder_len = len(worder)
    # Extract the groups of increasing/monotonic sequences.
    increasing_sequences = find_increasing_sequences(worder)
    # Calculate no. of increasing_pairs in *worder* list.
    num_increasing_pairs = sum(choose(len(seq),2) for seq in increasing_sequences) 
    # Calculate no. of possible pairs.
    num_possible_pairs = choose(worder_len, 2)
    # Kendall's Tau computation.
    tau = 2 * num_increasing_pairs / num_possible_pairs -1
    if normalize: # If normalized, the tau output falls between 0.0 to 1.0
        return (tau + 1) /2
    else: # Otherwise, the tau outputs falls between -1.0 to +1.0
        return tau