Ejemplo n.º 1
0
 def test_correlation_matrix(self):
     """Check correlation_matrix entries against pairwise correlation().

     Builds the matrix for three short series and compares the diagonal
     and lower-triangle entries with the first element returned by
     correlation() for the matching pair of series.
     """
     series_a = [2, 4, 6, 8]
     series_b = [1.5, 1.4, 1.2, 1.1]
     series_c = [15, 10, 5, 20]
     matrix = correlation_matrix([series_a, series_b, series_c])

     # row 0: only the diagonal entry is checked
     self.assertFloatEqual(matrix[0, 0], [1.0])

     # row 1: r(b, a) followed by the diagonal
     observed_row_1 = [matrix[1, 0], matrix[1, 1]]
     expected_row_1 = [correlation(series_b, series_a)[0], 1.0]
     self.assertFloatEqual(observed_row_1, expected_row_1)

     # row 2: compared as a whole row
     expected_row_2 = [
         correlation(series_c, series_a)[0],
         correlation(series_c, series_b)[0],
         1.0,
     ]
     self.assertFloatEqual(matrix[2], expected_row_2)
Ejemplo n.º 2
0
 def test_correlation_matrix(self):
     """correlation_matrix should agree with pairwise correlation() calls.

     The diagonal is expected to be 1.0 and each lower-triangle entry is
     expected to equal element 0 of correlation() for that pair.
     """
     a, b, c = [2, 4, 6, 8], [1.5, 1.4, 1.2, 1.1], [15, 10, 5, 20]
     m = correlation_matrix([a, b, c])

     self.assertFloatEqual(m[0, 0], [1.0])

     second_row = [m[1, 0], m[1, 1]]
     self.assertFloatEqual(second_row, [correlation(b, a)[0], 1.0])

     third_row_expected = [correlation(c, a)[0], correlation(c, b)[0], 1.0]
     self.assertFloatEqual(m[2], third_row_expected)
Ejemplo n.º 3
0
def evaluate_test_dataset(observed_table, expected_table):
    """Evaluate the correlation between an observed and expected biom table.

    Only observation ids present in both tables are used.

    Returns a tuple (scatter_data_points, correlations):
    scatter_data_points -- list of (observed, expected) value pairs for a
        scatter plot of observed v. expected values
    correlations -- dict keyed by method. "pearson" and "spearman" each
        hold (r, probability); "pearson_r2" and "spearman_r2" each hold a
        one-element list containing r squared. When there are two or fewer
        data points only "pearson" and "spearman" are present, both set
        to (None, None).

    Raises ValueError if the two tables share no observation ids.
    """
    # identify the overlapping otus that can be used to predict metagenomes
    obs_ids = observed_table.ids(axis='observation')
    exp_ids = expected_table.ids(axis='observation')
    overlapping_ids = list(set(obs_ids) & set(exp_ids))

    if len(overlapping_ids) < 1:
        # single pre-formatted argument so the output is the same whether
        # print is a statement (Python 2) or a function (Python 3)
        print("obs ids: %s" % (obs_ids[0:10],))
        print("exp ids: %s" % (exp_ids[0:10],))

        raise ValueError(
            "No ids are in common  between the observed and expected tables, so no evaluations can be performed.")

    # build lists of filtered data - we're going to need the data in
    # numpy arrays, so it makes sense to compute this way rather than
    # filtering the tables
    obs_data = [observed_table.data(obs_id, axis='observation')
                for obs_id in overlapping_ids]
    exp_data = [expected_table.data(obs_id, axis='observation')
                for obs_id in overlapping_ids]

    flat_obs_data = ravel(array(obs_data))
    flat_exp_data = ravel(array(exp_data))

    # GET THE SCATTER PLOT POINTS
    # list() so len() below also works where zip returns an iterator
    scatter_data_points = list(zip(flat_obs_data, flat_exp_data))

    correlations = {}
    if len(scatter_data_points) <= 2:
        # can't validly calc correlation
        correlations["pearson"] = (None, None)
        correlations["spearman"] = (None, None)
        return scatter_data_points, correlations

    # CALCULATE CORRELATIONS
    pearson_r, pearson_t_prob = correlation(flat_obs_data, flat_exp_data)
    correlations["pearson"] = (pearson_r, pearson_t_prob)
    correlations["pearson_r2"] = [pearson_r**2]

    spearman_r, spearman_t_prob = \
        spearman_correlation(flat_obs_data, flat_exp_data)
    correlations["spearman"] = (spearman_r, spearman_t_prob)
    correlations["spearman_r2"] = [spearman_r**2]

    return scatter_data_points, correlations
Ejemplo n.º 4
0
def run_single_correlation(OTU, category_info, otu_sample_info):
    """Correlate one OTU's per-sample counts with a numeric category.

    OTU -- key into otu_sample_info for the OTU of interest
    category_info -- mapping of sample id to category value; each value
        must be convertible to float
    otu_sample_info -- mapping of OTU to a mapping of sample id to count

    Samples absent from the OTU's entry are given a count of 0.

    Returns the (r, prob) pair produced by correlation().
    Raises ValueError if a category value is not numeric.
    """
    sample_info = otu_sample_info[OTU]

    # one abundance value and one category value per sample, in step
    OTU_abundance_values = []
    category_values = []
    for sample in category_info:
        # only the category conversion is guarded, so the error message
        # below is never reported for an unrelated failure
        try:
            cat_val = float(category_info[sample])
        except ValueError:
            raise ValueError(
                "The category values must be numeric to use the correlation option"
            )
        # even if this OTU is not observed, we can use count=0
        count = sample_info[sample] if sample in sample_info else 0
        category_values.append(cat_val)
        OTU_abundance_values.append(float(count))

    r, prob = correlation(Numbers(OTU_abundance_values),
                          Numbers(category_values))
    return r, prob
def run_single_correlation(OTU_abundance_values, category_values,
                           filter=1):
    """Correlate category values with OTU abundances, if enough samples.

    OTU_abundance_values -- sequence of abundance values
    category_values -- sequence of category values
    filter -- minimum number of category values required (passed through
        int()); with fewer values no correlation is attempted

    Returns the (r, prob) pair from correlation(), or (None, None) when
    there are fewer category values than filter.
    """
    sample_count = len(category_values)
    # guard clause: too few samples to bother correlating
    if sample_count < int(filter):
        return None, None

    r_value, p_value = correlation(Numbers(category_values),
                                   Numbers(OTU_abundance_values))
    return r_value, p_value
Ejemplo n.º 6
0
 def test_compare(self):
     """Compare the internal ASA calculation with the STRIDE one.

     Returns without asserting anything when the 'stride' mode raises
     ApplicationNotFoundError. Otherwise the per-residue 'ASA' and
     'STRIDE_ASA' values are correlated and element 1 of the
     correlation() result is expected to be almost 0.
     """
     self.input_file = os.path.join('data', '2E12.pdb')
     self.input_structure = PDBParser(open(self.input_file))
     structure = self.input_structure

     try:
         asa.asa_xtra(structure, mode='stride')
     except ApplicationNotFoundError:
         return

     asa.asa_xtra(structure)
     # sum the atom-level 'ASA' values up to residue level
     structure.propagateData(sum, 'A', 'ASA', xtra=True)

     residues = einput(structure, 'R')
     # residues whose name is not 'H_HOH'
     selected = residues.selectChildren('H_HOH', 'ne', 'name').values()
     value_pairs = [(res.xtra['ASA'], res.xtra['STRIDE_ASA'])
                    for res in selected]
     internal_asa = [pair[0] for pair in value_pairs]
     stride_asa = [pair[1] for pair in value_pairs]

     self.assertAlmostEqual(correlation(internal_asa, stride_asa)[1], 0.)
Ejemplo n.º 7
0
 def test_compare(self):
     """Internal ASA values should correlate with those from STRIDE.

     If asa_xtra in 'stride' mode raises ApplicationNotFoundError the
     test returns early without any assertion.
     """
     pdb_path = os.path.join('data', '2E12.pdb')
     self.input_file = pdb_path
     self.input_structure = PDBParser(open(pdb_path))

     try:
         asa.asa_xtra(self.input_structure, mode='stride')
     except ApplicationNotFoundError:
         return
     else:
         # default mode: the internal ASA implementation
         asa.asa_xtra(self.input_structure)

     self.input_structure.propagateData(sum, 'A', 'ASA', xtra=True)
     by_residue = einput(self.input_structure, 'R')
     non_water = by_residue.selectChildren('H_HOH', 'ne', 'name')

     own_values, stride_values = [], []
     for res in non_water.values():
         own_values.append(res.xtra['ASA'])
         stride_values.append(res.xtra['STRIDE_ASA'])

     # element 1 of the correlation() result should be close to zero
     outcome = correlation(own_values, stride_values)
     self.assertAlmostEqual(outcome[1], 0.)
Ejemplo n.º 8
0
    def test_correlation(self):
        """correlation() should return the expected (r, probability) pairs.

        Per the original author, the reference values come from R's
        cor.test().
        """
        x = [1, 2, 3, 5]
        y = [0, 0, 0, 0]
        z = [1, 1, 1, 1]
        a = [2, 4, 6, 8]
        b = [1.5, 1.4, 1.2, 1.1]
        c = [15, 10, 5, 20]

        bad = [1, 2, 3]   # originally gave r = 1.0000000002

        # exact cases: identical series and constant series
        exact_cases = (
            (x, x, (1, 0)),
            (x, y, (0, 1)),
            (y, z, (0, 1)),
        )
        for first, second, expected in exact_cases:
            self.assertFloatEqual(correlation(first, second), expected)

        # approximate cases, each with its own absolute tolerance
        approx_cases = (
            (a, (0.9827076, 0.01729), 1e-5),
            (b, (-0.9621405, 0.03786), 1e-5),
            (c, (0.3779645, 0.622), 1e-3),
        )
        for other, expected, tolerance in approx_cases:
            self.assertFloatEqualAbs(correlation(x, other), expected,
                                     tolerance)

        self.assertEqual(correlation(bad, bad), (1, 0))
Ejemplo n.º 9
0
    def test_correlation(self):
        """Check correlation() results for a handful of short series.

        The reference values are stated by the original author to match
        R's cor.test().
        """
        rising = [1, 2, 3, 5]
        zeros = [0, 0, 0, 0]
        ones = [1, 1, 1, 1]
        evens = [2, 4, 6, 8]
        falling = [1.5, 1.4, 1.2, 1.1]
        mixed = [15, 10, 5, 20]

        # this series originally gave r = 1.0000000002
        short_series = [1, 2, 3]

        self_result = correlation(rising, rising)
        self.assertFloatEqual(self_result, (1, 0))

        constant_result = correlation(rising, zeros)
        self.assertFloatEqual(constant_result, (0, 1))

        both_constant_result = correlation(zeros, ones)
        self.assertFloatEqual(both_constant_result, (0, 1))

        self.assertFloatEqualAbs(
            correlation(rising, evens), (0.9827076, 0.01729), 1e-5)
        self.assertFloatEqualAbs(
            correlation(rising, falling), (-0.9621405, 0.03786), 1e-5)
        self.assertFloatEqualAbs(
            correlation(rising, mixed), (0.3779645, 0.622), 1e-3)

        self.assertEqual(correlation(short_series, short_series), (1, 0))
Ejemplo n.º 10
0
def spearman_correlation(x_array, y_array, tails="two-tailed"):
    """Calculate the Spearman rank correlation for x and y.

    x_array -- a 1D NumPy array
    y_array -- a 1D NumPy array
    tails -- passed through to calc_spearman_t

    Returns (r, spearman_t_prob).
    """
    # Replace the raw values with their ranks
    ranked_x = convert_vals_to_spearman_ranks(x_array)
    ranked_y = convert_vals_to_spearman_ranks(y_array)

    # r comes from correlation() applied to the ranks; the probability
    # it reports is discarded because ranking changes the statistic
    rho, _pearson_prob = correlation(ranked_x, ranked_y)

    # probability from the rank-specific t calculation instead
    sample_size = len(x_array)
    corrected_prob = calc_spearman_t(rho, n=sample_size, tails=tails)

    return rho, corrected_prob
Ejemplo n.º 11
0
def spearman_correlation(x_array, y_array, tails='two-tailed'):
    """Return the Spearman rank correlation of x and y with its probability.

    x_array -- a 1D NumPy array
    y_array -- a 1D NumPy array
    tails -- forwarded unchanged to calc_spearman_t

    The result is the pair (r, spearman_t_prob).
    """
    # Rank both inputs, then correlate the ranks
    rank_pair = (convert_vals_to_spearman_ranks(x_array),
                 convert_vals_to_spearman_ranks(y_array))
    r, _ignored_prob = correlation(rank_pair[0], rank_pair[1])

    # The probability from correlation() is not used: the conversion to
    # ranks affects it, so it is recomputed by calc_spearman_t
    return r, calc_spearman_t(r, n=len(x_array), tails=tails)
Ejemplo n.º 12
0
def run_single_correlation(OTU_abundance_values, category_values):
    """Correlate category values with the abundances of one OTU.

    Both sequences are wrapped in Numbers and handed to correlation(),
    category values first; its result is returned unchanged.
    """
    category_numbers = Numbers(category_values)
    abundance_numbers = Numbers(OTU_abundance_values)
    return correlation(category_numbers, abundance_numbers)
Ejemplo n.º 13
0
def hommola_cospeciation_test(host_dist, par_dist, matrix, permutations):
    """Performs the cospeciation test from Hommola et al.

    Takes numpy matrices of jxj host distances, ixi 'parasite' (OTU)
    distances, and a binary ixj association matrix.

    host_dist -- jxj host distance matrix
    par_dist -- ixi parasite distance matrix
    matrix -- ixj association matrix; entries equal to 1 mark an
        interaction between parasite i and host j
    permutations -- number of random permutations to run

    Returns (p_val, r, perm_stats): the permutation p-value, the observed
    correlation coefficient, and the list of coefficients obtained from
    each permutation.

    test data from Hommola et al MB&E 2009:
    hdist = numpy.array([[0,3,8,8,9],[3,0,7,7,8],[8,7,0,6,7],[8,7,6,0,3],[9,8,7,3,0]])
    pdist = numpy.array([[0,5,8,8,8],[5,0,7,7,7],[8,7,0,4,4],[8,7,4,0,2],[8,7,4,2,0]])
    int = numpy.array([[1,0,0,0,0],[0,1,0,0,0],[0,0,1,0,0],[0,0,0,1,0],[0,0,0,1,1]])

    This is basically a direct translation from the R code, and not
    optimized in any way for Python.

    NOTE: the method return signature is now changed.
    For backwards compatibility purposes -
    when this method is called, 'result' has changed to 'result[0]'
    """
    import cogent.maths.stats.test as stats
    from random import shuffle

    n_pars, n_hosts = matrix.shape[0], matrix.shape[1]

    # Generate lists of host and symbiont edges, such that the index
    # of the lists represents an edge connecting the host to the parasite.
    # A single row-major pass is enough; the lists grow per edge so they
    # cannot be overrun when matrix.sum() differs from the number of 1s.
    hosts = []
    pars = []
    for i in range(n_pars):
        for j in range(n_hosts):
            if matrix[i, j] == 1:
                hosts.append(j)
                pars.append(i)

    # get a vector of pairwise distances for each interaction edge
    x = get_dist(hosts, host_dist, list(range(n_hosts)))
    y = get_dist(pars, par_dist, list(range(n_pars)))

    # calculate the observed correlation coefficient for this host/symbionts
    r = stats.correlation(x, y)[0]

    # now do permutations. Initialize index lists of the appropriate size.
    # Real lists are required: shuffle() works in place and cannot reorder
    # an immutable range object.
    mp = list(range(par_dist.shape[1]))
    mh = list(range(host_dist.shape[1]))
    below = 0

    perm_stats = []  # shuffled correlation values, one per permutation

    for _ in range(permutations):
        # Generate a shuffled list of indexes for each permutation. This
        # effectively randomizes which host is associated with which
        # symbiont, but maintains the distribution of genetic distances.
        shuffle(mp)
        shuffle(mh)

        # Get pairwise distances in shuffled order
        y_p = get_dist(pars, par_dist, mp)
        x_p = get_dist(hosts, host_dist, mh)

        # calculate shuffled correlation; count the permutations whose
        # coefficient is at least as large as the observed one
        r_p = stats.correlation(x_p, y_p)[0]
        perm_stats.append(r_p)
        if r_p >= r:
            below += 1

    p_val = float(below + 1) / float(permutations + 1)

    return p_val, r, perm_stats
Ejemplo n.º 14
0
def evaluate_test_dataset(observed_table, expected_table):
    """Evaluate the correlation between an observed and expected biom table.

    Only observation ids present in both tables are used.

    Returns a tuple (scatter_data_points, correlations):
    scatter_data_points -- list of (observed, expected) value pairs for a
        scatter plot of observed v. expected values
    correlations -- dict keyed by method. "pearson" and "spearman" each
        hold (r, probability); "pearson_r2" and "spearman_r2" each hold a
        one-element list containing r squared. When there are two or fewer
        data points only "pearson" and "spearman" are present, both set
        to (None, None).

    Raises ValueError if the two tables share no observation ids.
    """
    # identify the overlapping otus that can be used to predict metagenomes
    overlapping_ids = list(
        set(observed_table.ObservationIds)
        & set(expected_table.ObservationIds))

    if len(overlapping_ids) < 1:
        # single pre-formatted argument so the output is the same whether
        # print is a statement (Python 2) or a function (Python 3)
        print("obs ids: %s" % (observed_table.ObservationIds[0:10],))
        print("exp ids: %s" % (expected_table.ObservationIds[0:10],))

        raise ValueError(
            "No ids are in common  between the observed and expected tables, so no evaluations can be performed.")

    # build lists of filtered data - we're going to need the data in
    # numpy arrays, so it makes sense to compute this way rather than
    # filtering the tables
    obs_data = [observed_table.observationData(obs_id)
                for obs_id in overlapping_ids]
    exp_data = [expected_table.observationData(obs_id)
                for obs_id in overlapping_ids]

    flat_obs_data = ravel(array(obs_data))
    flat_exp_data = ravel(array(exp_data))

    # GET THE SCATTER PLOT POINTS
    # list() so len() below also works where zip returns an iterator
    scatter_data_points = list(zip(flat_obs_data, flat_exp_data))

    correlations = {}
    if len(scatter_data_points) <= 2:
        # can't validly calc correlation
        correlations["pearson"] = (None, None)
        correlations["spearman"] = (None, None)
        return scatter_data_points, correlations

    # CALCULATE CORRELATIONS
    pearson_r, pearson_t_prob = correlation(flat_obs_data, flat_exp_data)
    correlations["pearson"] = (pearson_r, pearson_t_prob)
    correlations["pearson_r2"] = [pearson_r**2]

    spearman_r, spearman_t_prob = \
        spearman_correlation(flat_obs_data, flat_exp_data)
    correlations["spearman"] = (spearman_r, spearman_t_prob)
    correlations["spearman_r2"] = [spearman_r**2]

    return scatter_data_points, correlations
Ejemplo n.º 15
0
def run_single_correlation(OTU_abundance_values, category_values):
    """Run correlation() on category values versus OTU abundances.

    Each input sequence is converted with Numbers first; the category
    values are passed as the first argument. Whatever correlation()
    returns is returned as is.
    """
    wrapped_categories = Numbers(category_values)
    wrapped_abundances = Numbers(OTU_abundance_values)
    outcome = correlation(wrapped_categories, wrapped_abundances)
    return outcome
Ejemplo n.º 16
0
def plot_regression_line(x, y, line_color='r', axes=None, prob_axes=False,
                         axis_range=None):
    """Plots the regression line, and returns the equation.

    x and y are the x and y data for a single series
    line_color is a matplotlib color, will be used for the line
    axes is the axes the regression will be plotted against; gca() is
        used when it is None
    prob_axes, if true, forces the axes to be between 0 and 1
    axis_range, if not None, forces the axes to be between
        (xmin, xmax, ymin, ymax).

    Returns (equation, line_color), where equation is a string holding
    the fitted slope and intercept on its first line and r squared on
    its second.

    The line is drawn only when both clipped end points fall inside the
    x range. A horizontal fit (slope 0) lying outside the y window is
    skipped instead of dividing by zero.
    """
    if axes is None:
        axes = gca()
    m, b = regress(x, y)
    r, _ = correlation(x, y)  # second element is not needed here
    # m is the slope, b is the intercept.
    r_str = '%0.3g' % (r**2)
    m_str = '%0.3g' % m
    b_str = '%0.3g' % b

    # want to clip the line so it's contained entirely within the graph
    # coordinates. Basically, we need to find the values of y where x
    # is at x_min and x_max, and the values of x where y is at y_min and
    # y_max.

    if (not prob_axes) and (axis_range is None):
        # no window requested: span the empirical x values
        x_min, x_max = min(x), max(x)
        start = (x_min, m*x_min + b)
        end = (x_max, m*x_max + b)
    else:
        if prob_axes:
            x_min, x_max = 0, 1
            y_min, y_max = 0, 1
        else:  # axis range must have been set
            x_min, x_max, y_min, y_max = axis_range

        def clip(x_edge):
            """Point of the line at x_edge, moved onto the y window.

            Returns None when the line can never reach the window.
            """
            y_edge = m*x_edge + b
            if y_edge < y_min:      # too low: find x at y_min
                y_hit = y_min
            elif y_edge > y_max:    # too high: find x at y_max
                y_hit = y_max
            else:                   # just right
                return x_edge, y_edge
            if m == 0:
                # horizontal line outside the window never crosses it
                return None
            return (y_hit - b)/m, y_hit

        start = clip(x_min)
        end = clip(x_max)

    # need to check that the series wasn't entirely out of range
    if start is not None and end is not None:
        x1, y1 = start
        x2, y2 = end
        if (x_min <= x1 <= x_max) and (x_min <= x2 <= x_max):
            axes.plot([x1, x2], [y1, y2], color=line_color, linewidth=0.5)

    if b >= 0:
        sign_str = ' + '
    else:
        # a negative intercept already carries its own minus sign
        sign_str = ' '

    equation = ''.join(['y= ', m_str, 'x', sign_str, b_str, '\nr$^2$=', r_str])
    return equation, line_color
Ejemplo n.º 17
0
def plot_regression_line(x, y, line_color='r', axes=None, prob_axes=False,
                         axis_range=None):
    """Plots the regression line, and returns the equation.

    x and y are the x and y data for a single series
    line_color is a matplotlib color, will be used for the line
    axes is the axes the regression will be plotted against; gca() is
        used when it is None
    prob_axes, if true, forces the axes to be between 0 and 1
    axis_range, if not None, forces the axes to be between
        (xmin, xmax, ymin, ymax).

    Returns (equation, line_color), where equation is a string holding
    the fitted slope and intercept on its first line and r squared on
    its second.

    The line is drawn only when both clipped end points fall inside the
    x range. A horizontal fit (slope 0) lying outside the y window is
    skipped instead of dividing by zero.
    """
    if axes is None:
        axes = gca()
    m, b = regress(x, y)
    r, _ = correlation(x, y)  # second element is not needed here
    # m is the slope, b is the intercept.
    r_str = '%0.3g' % (r**2)
    m_str = '%0.3g' % m
    b_str = '%0.3g' % b

    # want to clip the line so it's contained entirely within the graph
    # coordinates. Basically, we need to find the values of y where x
    # is at x_min and x_max, and the values of x where y is at y_min and
    # y_max.

    if (not prob_axes) and (axis_range is None):
        # no window requested: span the empirical x values
        x_min, x_max = min(x), max(x)
        start = (x_min, m*x_min + b)
        end = (x_max, m*x_max + b)
    else:
        if prob_axes:
            x_min, x_max = 0, 1
            y_min, y_max = 0, 1
        else:  # axis range must have been set
            x_min, x_max, y_min, y_max = axis_range

        def clip(x_edge):
            """Point of the line at x_edge, moved onto the y window.

            Returns None when the line can never reach the window.
            """
            y_edge = m*x_edge + b
            if y_edge < y_min:      # too low: find x at y_min
                y_hit = y_min
            elif y_edge > y_max:    # too high: find x at y_max
                y_hit = y_max
            else:                   # just right
                return x_edge, y_edge
            if m == 0:
                # horizontal line outside the window never crosses it
                return None
            return (y_hit - b)/m, y_hit

        start = clip(x_min)
        end = clip(x_max)

    # need to check that the series wasn't entirely out of range
    if start is not None and end is not None:
        x1, y1 = start
        x2, y2 = end
        if (x_min <= x1 <= x_max) and (x_min <= x2 <= x_max):
            axes.plot([x1, x2], [y1, y2], color=line_color, linewidth=0.5)

    if b >= 0:
        sign_str = ' + '
    else:
        # a negative intercept already carries its own minus sign
        sign_str = ' '

    equation = ''.join(['y= ', m_str, 'x', sign_str, b_str, '\nr$^2$=', r_str])
    return equation, line_color