Example #1
def pearsonr(x, y):
    """
    generalized from scipy.stats.pearsonr
    """
    # x and y should have same length.
    
    x_shape = x.shape
    if len(x_shape) > 1:
        x = x.reshape((x_shape[0],prod(x_shape[1:])))

    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)
    mx = x.mean(0)
    my = y.mean(0)
    xm, ym = x-mx, y-my

    r_num = n*np.dot(xm.T,ym)
    r_den = n*np.sqrt(np.outer(ss(xm),ss(ym,0)))

    r = (r_num / r_den)
    
    # Presumably, if r > 1, then it is only some small artifact of floating
    # point arithmetic.
    r = np.minimum(r, 1.0)
    df = n-2

    # Use a small floating point value to prevent divide-by-zero nonsense
    # fixme: TINY is probably not the right value and this is probably not
    # the way to be robust. The scheme used in spearmanr is probably better.
    TINY = 1.0e-20
    t = r*np.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
    prob = betai(0.5*df,0.5,df/(df+t*t))

    return r,prob
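Note: scipy.stats.ss (the sum-of-squares helper used throughout these examples) and scipy.stats.betai were deprecated and later removed from SciPy, so snippets like the one above no longer run against a recent release (prod here is presumably numpy.prod). A minimal pair of stand-ins, assuming only NumPy and scipy.special are available:

import numpy as np
from scipy import special

def ss(a, axis=0):
    # sum of squares along the given axis; drop-in for the removed scipy.stats.ss
    a = np.asarray(a)
    return np.sum(a * a, axis=axis)

def betai(a, b, x):
    # regularized incomplete beta function, clipped at x = 1 like the old scipy.stats.betai
    x = np.asarray(x)
    x = np.where(x < 1.0, x, 1.0)
    return special.betainc(a, b, x)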
Example #2
def vectorized_correlation(x, y):
    """Compute correlation coefficient between arrays with vectorization.

    Parameters
    ----------
    x, y : array-like
        Dimensions on the final axis should match; the computation is
        vectorized over the preceding axes, with shapes matched or broadcast
        as needed. In other words, passing two (m x n) arrays will compute
        the correlation between each pair of corresponding rows and return a
        vector of length m. Passing one vector of length n and one array of
        shape (m x n) will compute the correlation between the vector and
        each row in the array, also returning a vector of length m.

    Returns
    -------
    r : array
        Correlation coefficient(s).

    """
    x, y = np.asarray(x), np.asarray(y)
    mx = x.mean(axis=-1)
    my = y.mean(axis=-1)
    xm, ym = x - mx[..., None], y - my[..., None]
    r_num = np.add.reduce(xm * ym, axis=-1)
    r_den = np.sqrt(stats.ss(xm, axis=-1) * stats.ss(ym, axis=-1))
    r = r_num / r_den
    return r
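A short usage sketch with made-up data, showing the broadcasting described in the docstring; stats.ss has to resolve, either on an old SciPy or by pointing it at the ss stand-in defined after Example #1:

import numpy as np
from scipy import stats

if not hasattr(stats, "ss"):
    stats.ss = ss  # patch in the stand-in on a modern SciPy

rng = np.random.RandomState(0)
x = rng.normal(size=(4, 100))               # four rows of 100 samples each
y = rng.normal(size=100)                    # a single vector of length 100

r_rows = vectorized_correlation(x, x + y)   # row by row -> shape (4,)
r_vec = vectorized_correlation(x, y)        # vector against each row -> shape (4,)
assert np.allclose(r_rows[0], np.corrcoef(x[0], x[0] + y)[0, 1])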
Example #3
def pearsonr(x, y):
    """
    generalized from scipy.stats.pearsonr
    """
    # x and y should have same length.

    x_shape = x.shape
    if len(x_shape) > 1:
        x = x.reshape((x_shape[0], prod(x_shape[1:])))

    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)
    mx = x.mean(0)
    my = y.mean(0)
    xm, ym = x - mx, y - my

    r_num = n * np.dot(xm.T, ym)
    r_den = n * np.sqrt(np.outer(ss(xm), ss(ym, 0)))

    r = (r_num / r_den)

    # Presumably, if r > 1, then it is only some small artifact of floating
    # point arithmetic.
    r = np.minimum(r, 1.0)
    df = n - 2

    # Use a small floating point value to prevent divide-by-zero nonsense
    # fixme: TINY is probably not the right value and this is probably not
    # the way to be robust. The scheme used in spearmanr is probably better.
    TINY = 1.0e-20
    t = r * np.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY)))
    prob = betai(0.5 * df, 0.5, df / (df + t * t))

    return r, prob
Example #4
def fit(xdata, ydata):
    """Calculate 2D regression.

    Args:
        xdata (numpy.ndarray): 1D array of independent data [ntim],
            where ntim is the number of time points (or other independent
            points).
        ydata (numpy.ndarray): 2D array of dependent data [ntim, nspat],
            where nspat is the number of spatial points (or other dependent
            points).
     
    Returns:
        numpy.ndarray of dimension [5, nspat].  The 5 outputs are: slope, 
        intercept, Pearson's correlation coefficient, two-sided p-value for
        a hypothesis test with null hypothesis that the slope is zero, 
        standard error for the slope estimate.

    """
    # Small number to prevent divide-by-zero errors
    TINY = 1.0e-20

    # Dimensions
    ntim = xdata.shape[0]
    nspat = ydata.shape[1]

    # Add a constant (1) to the xdata to allow for intercept calculation
    xdata_plus_const = utils.add_constant(xdata)

    # Calculate parameters of the regression by solving the OLS problem
    # in its matrix form
    mat1 = np.swapaxes(
        np.dot(xdata_plus_const.T, (xdata_plus_const[np.newaxis, :, :])), 0, 1)
    mat2 = np.dot(xdata_plus_const.T, ydata)
    beta = np.linalg.solve(mat1, mat2.T)
    output = beta.T

    # Pearson correlation coefficient
    xm, ym = xdata - xdata.mean(0), ydata - ydata.mean(0)
    r_num = np.dot(xm, ym)
    r_den = np.sqrt(stats.ss(xm) * stats.ss(ym))
    pearson_r = r_num / r_den

    # Two-sided p-value for a hypothesis test whose null hypothesis is that
    # the slope is zero.
    df = ntim - 2
    tval = pearson_r * np.sqrt(df / ((1.0 - pearson_r + TINY) *
                                     (1.0 + pearson_r + TINY)))
    pval = stats.distributions.t.sf(np.abs(tval), df) * 2

    # Standard error of the slope estimate
    sst = np.sum(ym**2, 0)
    ssr = (output[0, :]**2) * np.sum(xm**2)
    se = np.sqrt((1. / df) * (sst - ssr))
    stderr = se / np.sqrt(np.sum(xm**2))

    return np.vstack([output, pearson_r, pval, stderr])
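For reference, a self-contained NumPy-only sketch of the same five outputs (slope, intercept, r, p-value, slope standard error), cross-checked column by column against scipy.stats.linregress; it needs neither utils.add_constant nor the removed stats.ss:

import numpy as np
from scipy import stats as sps

def fit_columns(xdata, ydata):
    # slope, intercept, Pearson r, two-sided p-value and slope standard error,
    # one set per column of ydata
    n = xdata.shape[0]
    xm = xdata - xdata.mean()
    ym = ydata - ydata.mean(axis=0)
    sxx = np.sum(xm ** 2)
    slope = xm.dot(ym) / sxx
    intercept = ydata.mean(axis=0) - slope * xdata.mean()
    r = xm.dot(ym) / np.sqrt(sxx * np.sum(ym ** 2, axis=0))
    df = n - 2
    tval = r * np.sqrt(df / ((1.0 - r) * (1.0 + r)))
    pval = 2 * sps.distributions.t.sf(np.abs(tval), df)
    stderr = np.sqrt((np.sum(ym ** 2, axis=0) - slope ** 2 * sxx) / df) / np.sqrt(sxx)
    return np.vstack([slope, intercept, r, pval, stderr])

x = np.linspace(0.0, 1.0, 30)
y = np.outer(x, [2.0, -1.0]) + np.random.RandomState(0).normal(scale=0.1, size=(30, 2))
out = fit_columns(x, y)
ref = sps.linregress(x, y[:, 0])            # slope, intercept, rvalue, pvalue, stderr
assert np.allclose(out[:, 0], list(ref)[:5])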
Example #5
def fit(xdata, ydata):
    """Calculate 2D regression.

    Args:
        xdata (numpy.ndarray): 1D array of independent data [ntim],
            where ntim is the number of time points (or other independent
            points).
        ydata (numpy.ndarray): 2D array of dependent data [ntim, nspat],
            where nspat is the number of spatial points (or other dependent
            points).
     
    Returns:
        numpy.ndarray of dimension [5, nspat].  The 5 outputs are: slope, 
        intercept, Pearson's correlation coefficient, two-sided p-value for
        a hypothesis test with null hypothesis that the slope is zero, 
        standard error for the slope estimate.

    """
    # Small number to prevent divide-by-zero errors
    TINY = 1.0e-20

    # Dimensions
    ntim = xdata.shape[0]
    nspat = ydata.shape[1]

    # Add a constant (1) to the xdata to allow for intercept calculation
    xdata_plus_const = utils.add_constant(xdata)

    # Calculate parameters of the regression by solving the OLS problem
    # in its matrix form
    mat1 = np.swapaxes(np.dot(xdata_plus_const.T, (xdata_plus_const[np.newaxis, :, :])), 0, 1)
    mat2 = np.dot(xdata_plus_const.T, ydata)
    beta = np.linalg.solve(mat1, mat2.T)
    output = beta.T

    # Pearson correlation coefficient
    xm, ym = xdata - xdata.mean(0), ydata - ydata.mean(0)
    r_num = np.dot(xm, ym)
    r_den = np.sqrt(stats.ss(xm) * stats.ss(ym))
    pearson_r = r_num / r_den

    # Two-sided p-value for a hypothesis test whose null hypothesis is that
    # the slope is zero.
    df = ntim - 2
    tval = pearson_r * np.sqrt(df / ((1.0 - pearson_r + TINY) * (1.0 + pearson_r + TINY)))
    pval = stats.distributions.t.sf(np.abs(tval), df) * 2

    # Standard error of the slope estimate
    sst = np.sum(ym ** 2, 0)
    ssr = (output[0, :] ** 2) * np.sum(xm ** 2)
    se = np.sqrt((1.0 / df) * (sst - ssr))
    stderr = se / np.sqrt(np.sum(xm ** 2))

    return np.vstack([output, pearson_r, pval, stderr])
Example #6
def mypearsonr(x, y):
    """
    Calculates a Pearson correlation coefficient and the p-value for testing
    non-correlation.

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear
    relationship. Positive correlations imply that as x increases, so does
    y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x : (N,) array_like
        Input
    y : (N,) array_like
        Input

    Returns
    -------
    (Pearson's correlation coefficient,
     2-tailed p-value)

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation

    """
    # x and y should have same length.
    x = np.asarray(x)
    #print x
    y = np.asarray(y)
    n = len(x)
    mx = x.mean()
    my = y.mean()
    xm, ym = x-mx, y-my
    r_num = np.add.reduce(xm * ym)
    r_den = np.sqrt(stats.ss(xm) * stats.ss(ym))
    r = r_num / r_den
    #r = max(min(r, 1.0), -1.0)
    df = n-2
    if abs(r) == 1.0:
        prob = 0.0
    else:
        t_squared = r*r * (df / ((1.0 - r) * (1.0 + r)))
        prob = betai(0.5*df, 0.5, df / (df + t_squared))
    return r, prob
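A quick sanity check against SciPy's own implementation (assuming the snippet's stats.ss and betai references resolve, e.g. via the stand-ins after Example #1, and with the abs(r) comparison fixed as above):

import numpy as np
from scipy.stats import pearsonr as scipy_pearsonr

x = np.arange(20.0)
y = 3.0 * x + np.random.RandomState(1).normal(size=20)
r, p = mypearsonr(x, y)
r_ref, p_ref = scipy_pearsonr(x, y)
assert np.allclose([r, p], [r_ref, p_ref])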
Example #7
def mypearsonr(x, y):
    """
    Calculates a Pearson correlation coefficient and the p-value for testing
    non-correlation.

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear
    relationship. Positive correlations imply that as x increases, so does
    y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x : (N,) array_like
        Input
    y : (N,) array_like
        Input

    Returns
    -------
    (Pearson's correlation coefficient,
     2-tailed p-value)

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation

    """
    # x and y should have same length.
    x = np.asarray(x)
    #print x
    y = np.asarray(y)
    n = len(x)
    mx = x.mean()
    my = y.mean()
    xm, ym = x - mx, y - my
    r_num = np.add.reduce(xm * ym)
    r_den = np.sqrt(stats.ss(xm) * stats.ss(ym))
    r = r_num / r_den
    #r = max(min(r, 1.0), -1.0)
    df = n - 2
    if abs(r) == 1.0:
        prob = 0.0
    else:
        t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
        prob = betai(0.5 * df, 0.5, df / (df + t_squared))
    return r, prob
Example #8
def calcBrownsCombinedPnanRemove(snpSet, genotypeArray):
    
    nSNPs = snpSet.nSNPs
    pValArray, adjpValArray =  snpSet.getPvalues()
    chisq = sum(-2 * np.log(pValArray))
    adjchisq = sum(-2 * np.log(adjpValArray))
    
    colsWithMissingData = np.where(np.isnan(genotypeArray))[1]
    genotypeArray = np.delete(genotypeArray, colsWithMissingData, 1)
    
    ms = genotypeArray.mean(axis=1)[(slice(None,None,None),None)]
    datam = genotypeArray - ms
    datass = np.sqrt(stats.ss(datam,axis=1))

    runningSum = 0
    for i in xrange(nSNPs-1):
        temp = np.dot(datam[i:],datam[i].T)
        d = (datass[i:]*datass[i])
        rs = temp / d
        rs = np.absolute(rs)[1:]
        runningSum += 3.25 * np.sum(rs) +  .75 * np.dot(rs, rs)

    sigmaSq = 4*nSNPs+2*runningSum
    E = 2*nSNPs
    df = (2*(E*E))/sigmaSq
    runningSum = sigmaSq/(2*E)
    d = chisq/runningSum
    adjd = adjchisq/runningSum
    brownsP = stats.chi2.sf(d, df)
    adjBrownsP = stats.chi2.sf(adjd, df)

    return brownsP, adjBrownsP
Example #9
def select_ss(dm, levels, included):

    bign = len(dm)

    distances = (dm[i][j] for i, j in above_diagonal(bign) if included(levels[i], levels[j]))

    return stats.ss(distances)
Example #10
def f_twoway(dm, levels):

    bign = len(levels)  # number of observations
    dm = np.asarray(dm)  # distance matrix
    l = len(set(levels))  # number of levels
    a = len(set([l[0] for l in levels]))  # number of a-levels
    b = len(set([l[1] for l in levels]))  # number of b-levels
    n = bign / float(a * b)  # number of observations per level

    # sum of all distances
    ## sst = np.sum(stats.ss(r) for r in
    ##         (s[n+1:] for n,s in enumerate(dm[:-1])) )/float(bign)
    sst = stats.ss(chain(*(r[i + 1 :] for i, r in enumerate(dm)))) / float(bign)

    # same level of both a and b (error, within-group)
    ssr = select_ss(dm, levels, lambda a, b: a == b) / float(n)

    # same level of a
    sswa = select_ss(dm, levels, lambda a, b: a[0] == b[0]) / float(b * n)

    # same level of b
    sswb = select_ss(dm, levels, lambda a, b: a[1] == b[1]) / float(a * n)

    ssa = sst - sswa  # effect of a
    ssb = sst - sswb  # effect of b
    ssab = sst - ssa - ssb - ssr  # interaction sum-of-squares

    # these should each be separate functions?
    f_interaction = (ssab / float((a - 1) * (b - 1))) / (ssr / float(bign - a * b))
    f_a = (ssa / float((a - 1))) / (ssr / float(bign - a * b))
    f_b = (ssb / float((b - 1))) / (ssr / float(bign - a * b))

    return (f_interaction, f_a, f_b)
Example #11
def f_oneway(dm, levels):
    bign = len(levels)  #number of observations
    dm = np.asarray(dm)  #distance matrix
    a = len(set(levels))  #number of levels
    n = bign / a  #number of observations per level

    assert dm.shape == (bign, bign)  # check the dist matrix is square and the size
    # corresponds to the length of levels

    #total sum of squared distances
    sst = np.sum(
        stats.ss(r)
        for r in (s[n + 1:] for n, s in enumerate(dm[:-1]))) / float(bign)

    #sum of within-group squares
    #itertools.combinations(xrange(len(dm)),2)#top half of dm
    ssw = np.sum((dm[i][j]**2
                  for i, j in product(xrange(len(dm)), xrange(1, len(dm)))
                  if i < j and levels[i] == levels[j])) / float(n)

    ssa = sst - ssw

    fstat = (ssa / float(a - 1)) / (ssw / float(bign - a))
    #print (fstat,sst,ssa,ssw,a,bign,n)

    return fstat
Example #12
def f_twoway(dm, levels):

    bign = len(levels)  #number of observations
    dm = np.asarray(dm)  #distance matrix
    l = len(set(levels))  #number of levels
    a = len(set([l[0] for l in levels]))  #number of a-levels
    b = len(set([l[1] for l in levels]))  #number of b-levels
    n = bign / float(a * b)  #number of observations per level

    #sum of all distances
    ## sst = np.sum(stats.ss(r) for r in
    ##         (s[n+1:] for n,s in enumerate(dm[:-1])) )/float(bign)
    sst = stats.ss(chain(*(r[i + 1:] for i, r in enumerate(dm)))) / float(bign)

    #same level of both a and b (error, within-group)
    ssr = select_ss(dm, levels, lambda a, b: a == b) / float(n)

    #same level of a
    sswa = select_ss(dm, levels, lambda a, b: a[0] == b[0]) / float(b * n)

    #same level of b
    sswb = select_ss(dm, levels, lambda a, b: a[1] == b[1]) / float(a * n)

    ssa = sst - sswa  #effect of a
    ssb = sst - sswb  #effect of b
    ssab = sst - ssa - ssb - ssr  #interaction sum-of-squares

    #these should each be separate functions?
    f_interaction = (ssab / float(
        (a - 1) * (b - 1))) / (ssr / float(bign - a * b))
    f_a = (ssa / float((a - 1))) / (ssr / float(bign - a * b))
    f_b = (ssb / float((b - 1))) / (ssr / float(bign - a * b))

    return (f_interaction, f_a, f_b)
Example #13
def select_ss(dm, levels, included):

    bign = len(dm)

    distances = (dm[i][j] for i, j in above_diagonal(bign)
                 if included(levels[i], levels[j]))

    return stats.ss(distances)
Example #14
def fastPearsonCorrelation(TC):
    N = TC.shape[1]
    corr, TCm = fastCovariance(TC)
    TCss = sqrt(ss(TCm, axis=1))
    for i in xrange(N):
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    return where(isfinite(corr), corr, 0.)
Example #15
def pearson(self, x, y):
    data = np.vstack((x, y))
    ms = data.mean(axis=1)[(slice(None, None, None), None)]
    datam = data - ms
    datass = np.sqrt(ss(datam, axis=1))
    temp = np.dot(datam[1:], datam[0].T)
    rs = temp / (datass[1:] * datass[0])
    return rs
    """ Two-way chi-square test of independence. 
Example #16
def pearson(x, y):
    """ Correlates row vector x with each row vector in 2D array y. """
    data = np.vstack((x, y))
    ms = data.mean(axis=1)[(slice(None, None, None), None)]
    datam = data - ms
    datass = np.sqrt(ss(datam, axis=1))
    temp = np.dot(datam[1:], datam[0].T)
    rs = temp / (datass[1:] * datass[0])
    return rs
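A usage sketch with made-up data (ss is assumed to be either scipy.stats.ss on an old SciPy or the stand-in from earlier): each returned value should equal the plain Pearson r between x and the corresponding row of y.

import numpy as np

x = np.random.RandomState(2).normal(size=50)
y = np.vstack([2 * x + 1, -x, np.random.RandomState(3).normal(size=50)])
rs = pearson(x, y)                      # one r per row of y -> shape (3,)
assert np.allclose(rs, [np.corrcoef(x, row)[0, 1] for row in y])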
Example #17
def pearson(self, x, y):
  """ Correlates row vector x with each row vector in 2D array y. """
  data = np.vstack((x,y))
  ms = data.mean(axis=1)[(slice(None,None,None),None)]
  datam = data - ms
  datass = np.sqrt(ss(datam,axis=1))
  temp = np.dot(datam[1:],datam[0].T)
  rs = temp / (datass[1:]*datass[0])
  return rs
Example #18
def pearson(self, x, y):
	data = np.vstack((x,y))
	ms = data.mean(axis=1)[(slice(None,None,None),None)]
	datam = data - ms
	datass = np.sqrt(ss(datam,axis=1))
	temp = np.dot(datam[1:],datam[0].T)
	rs = temp / (datass[1:]*datass[0])
 	return rs

	""" Two-way chi-square test of independence. 
Example #19
def repeated_oneway(data):

    n = data.shape[0]
    k = data.shape[1]
    grand_mean = np.mean(data)
    measurement_mean = np.mean(data, axis=0)
    subject_mean = np.mean(data, axis=1)
    ssb = n * st.ss(measurement_mean - grand_mean)
    #   ssw = st.ss(data-measurement_mean)
    ssw = np.sum(st.ss(data - measurement_mean))
    sss = k * st.ss(subject_mean - grand_mean)
    sse = ssw - sss
    dfb = k - 1
    dfe = (n - 1) * (k - 1)
    msb = ssb / float(dfb)
    mse = sse / float(dfe)
    f = msb / mse
    p = st.fprob(dfb, dfe, f)
    return f, p
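st.fprob and st.ss in this snippet refer to functions removed from recent SciPy releases; fprob is simply the survival function of the F distribution, so a stand-in could be:

from scipy import stats

def fprob(dfnum, dfden, F):
    # P(X > F) for an F distribution with (dfnum, dfden) degrees of freedom;
    # drop-in for the removed scipy.stats.fprob
    return stats.f.sf(F, dfnum, dfden)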
Example #20
def repeated_oneway(data) :

  n = data.shape[0]
  k = data.shape[1]
  grand_mean = np.mean(data)
  measurement_mean = np.mean(data,axis=0)
  subject_mean = np.mean(data,axis=1)
  ssb = n*st.ss(measurement_mean-grand_mean)
#   ssw = st.ss(data-measurement_mean)
  ssw = np.sum(st.ss(data-measurement_mean))
  sss = k*st.ss(subject_mean-grand_mean)
  sse = ssw-sss
  dfb = k - 1
  dfe = (n-1)*(k-1)
  msb = ssb / float(dfb)
  mse = sse / float(dfe)
  f = msb / mse
  p = st.fprob(dfb,dfe,f)
  return f,p
  
Example #21
def f_oneway(dm, levels):
    bign = len(levels)
    dm = np.asarray(dm)
    a = len(set(levels))
    n = bign/a
    assert dm.shape == (bign, bign)
    sst = np.sum(stats.ss(r) for r in (s[n+1:] for n, s in enumerate(dm[:-1])))/float(bign)
    ssw = np.sum((dm[i][j]**2 for i, j in product(xrange(len(dm)), xrange(1, len(dm))) if i < j and levels[i] == levels[j]))/float(n)
    ssa = sst - ssw
    fstat = (ssa/float(a-1))/(ssw/float(bign-a))
    return fstat
Example #22
def pearson(x, y):
    """ Correlates row vector x with each row vector in 2D array y. 
    From neurosynth.stats.py - author: Tal Yarkoni
    """
    data = np.vstack((x, y))
    ms = data.mean(axis=1)[(slice(None, None, None), None)]
    datam = data - ms
    datass = np.sqrt(ss(datam, axis=1))
    temp = np.dot(datam[1:], datam[0].T)
    rs = temp / (datass[1:] * datass[0])
    return rs
Example #23
def jackknife_bias_correct(pairs,confidence=None,return_all=False,
                           nan_remove=True,return_raw=False):
    '''
    Return jackknife-bias-corrected estimate from estimate-nsamples pairs
    Pairs can be either a list of tuples, or a 2 x nestimates array.
    If 'confidence' is between 0 and 1, return the mean with lower and upper
    bounds at -/+ the confidence interval.
    If 'confidence' is None, return the mean and standard error.
    If 'return_all' is True, return the mean, standard error, number of points,
    and confidence interval size.
    '''
    
    data = asarray(pairs)
    
    if nan_remove:
        data = data[isfinite(data)[:,0],:]
    
    y = data[:,0]
    x = 1./data[:,1]
    n = len(x)
    
    # Compute linear regression and standard error of intercept
    (slope,intercept,r,p,slope_se) = linregress(x,y)
    intercept_se = slope_se * sqrt(ss(x)/n)
    
    # Return mean and SE if no value is specified:
    if confidence is None:
        if return_all:
            if return_raw:
                np = data[:,1]
                max_n = max(np)
                raw_mean = mean(data[np==max_n,0])
                return intercept, intercept_se, n, raw_mean
            else:
                return intercept, intercept_se, n
        else:
            return intercept, intercept_se
    
    # Otherwise return intercept with confidence
    else:
        t_int = t._ppf((1+confidence)/2,n-2)
        intercept_int = t_int * intercept_se
        
        if return_all:
            if return_raw:
                np = data[:,1]
                max_n = max(np)
                raw_mean = mean(data[np==max_n,0])
                return intercept, intercept_se, n, intercept_int, raw_mean
            else:
                return intercept, intercept_se, n, intercept_int
        else:
            return intercept, intercept - intercept_int, intercept + intercept_int
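A usage sketch on synthetic data (hypothetical estimate/sample-size pairs; the snippet's from-style imports of asarray, isfinite, linregress, ss, sqrt, mean and t are assumed to be in scope): the estimates approach their limit linearly in 1/n, so the fitted intercept extrapolates to infinite sample size.

import numpy as np

ns = np.array([50.0, 100.0, 200.0, 400.0, 800.0])                 # subsample sizes
est = 3.0 - 10.0 / ns + np.random.RandomState(4).normal(scale=1e-3, size=ns.size)
pairs = np.column_stack([est, ns])                                # (estimate, nsamples) pairs

value, se = jackknife_bias_correct(pairs)                         # extrapolated value (about 3.0) and its SE
value, lower, upper = jackknife_bias_correct(pairs, confidence=0.95)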
Example #24
def test4(TC):
    t0 = time()
    ms = TC.mean(axis=0)[(slice(None, None, None), None)]
    TCm = TC.T - ms
    TCss = sqrt(ss(TCm, axis=1))
    N = TC.shape[1]
    corr = zeros((N, N))
    for i in xrange(N):
        corr[i, i:] = dot(TCm[i:], TCm[i].T)
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    print 'Pearson      ', time() - t0
    return corr
Example #25
def test5(TC):
    ''' Note: TC is modified in place by this function.
    '''
    t0 = time()
    ms = TC.mean(axis=0)[(slice(None, None, None), None)]
    TC -= ms.T
    TCss = sqrt(ss(TC, axis=0))
    N = TC.shape[1]
    corr = zeros((N, N))
    for i in xrange(N):
        corr[i, i:] = dot(TC[:, i:].T, TC[:, i])
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    print 'Pearson      ', time() - t0
    return corr
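Both loops should reproduce the column-wise correlation matrix; a quick check with made-up data (zeros, dot, sqrt, ss, xrange and time are assumed to be imported as in the surrounding module):

import numpy as np

TC = np.random.RandomState(5).normal(size=(120, 6))   # time x nodes
corr4 = test4(TC.copy())
corr5 = test5(TC.copy())                              # test5 modifies its argument in place
ref = np.corrcoef(TC, rowvar=False)
assert np.allclose(corr4, ref) and np.allclose(corr5, ref)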
Example #26
def f_oneway(dm, levels):
    bign = len(levels)
    dm = np.asarray(dm)
    a = len(set(levels))
    n = bign / a
    assert dm.shape == (bign, bign)
    sst = np.sum(
        stats.ss(r)
        for r in (s[n + 1:] for n, s in enumerate(dm[:-1]))) / float(bign)
    ssw = np.sum((dm[i][j]**2
                  for i, j in product(xrange(len(dm)), xrange(1, len(dm)))
                  if i < j and levels[i] == levels[j])) / float(n)
    ssa = sst - ssw
    fstat = (ssa / float(a - 1)) / (ssw / float(bign - a))
    return fstat
Example #27
def simpleLeastSquares(X,Y):
    """
    Compute the least-squares fit of y=ax+b
    Input: X is a list of sample x values, Y is a list of the
        corresponding Y values
    Output: A 2x1 matrix [a; b]
    """
    # Complete this function

    A=matrix([[ss(X),sum(X)],[sum(X),len(X)]],'double')
    c=matrix([[Sxy(X,Y)],[sum(Y)]],'double')
    #-------------------#
    # Calculate the values of matrices A and c
    # and return the value of p
    P = linalg.solve(A,c)
    return P
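The same normal equations written with plain NumPy and checked against numpy.polyfit (Sxy in the snippet above is presumably the sum of the elementwise products of X and Y):

import numpy as np

X = [0.0, 1.0, 2.0, 3.0, 4.0]
Y = [1.1, 2.9, 5.2, 7.1, 8.8]

A = np.array([[np.sum(np.square(X)), np.sum(X)],
              [np.sum(X), len(X)]])
c = np.array([np.dot(X, Y), np.sum(Y)])
a, b = np.linalg.solve(A, c)
assert np.allclose([a, b], np.polyfit(X, Y, 1))   # polyfit returns [slope, intercept] for degree 1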
Example #28
def PearsonCorrelation(TC):
    # TODO: change to use the covariance, see Plot.py
    ''' Return the Pearson correlation matrix. The calculation is done for half of
    the matrix and mirrored for symmetry.
    TC needs to have a shape like [time, nodes].
    Note: TC is modified in place by this function.
    '''
    from scipy.stats import ss
    from pylab import sqrt as np_sqrt, dot as np_dot
    TC -= TC.mean(axis=0)[(slice(None, None, None), None)].T
    TCss = np_sqrt(ss(TC, axis=0))
    N = TC.shape[1]
    corr = zeros((N, N))
    for i in xrange(N):
        corr[i, i:] = np_dot(TC[:, i:].T, TC[:, i])
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    return corr
Example #29
def write_correlation_matrix(in_file, mask_file, out_file):
    import nibabel as nb
    import numpy as np
    from scipy.stats import ss
    import os
    mask_nii = nb.load(mask_file)
    data_nii = nb.load(in_file)

    data = data_nii.get_data()[mask_nii.get_data() > 0, :]
    print(data.shape[0] * (data.shape[0] - 1) / 2)
    # NOTE: this early return looks like leftover debugging; the correlation
    # computation below is skipped while it is in place.
    return

    corr_matrix = np.memmap(out_file,
                            dtype='int16',
                            mode='w+',
                            shape=(data.shape[0] * (data.shape[0] - 1) / 2))

    counter = 0
    ms = data.mean(axis=1)[(slice(None, None, None), None)]
    datam = data - ms
    datass = np.sqrt(ss(datam, axis=1))

    status = 0
    for i in xrange(0, data.shape[0]):
        temp = np.dot(datam[i + 1:], datam[i].T)
        rs = temp / (datass[i + 1:] * datass[i])
        corr_matrix[counter:counter + len(rs)] = rs * 10000
        counter += len(rs)

        if (counter / float(len(corr_matrix))) * 100 - status > 1:
            print "%d" % (counter / float(len(corr_matrix)) * 100)
            status = (counter / float(len(corr_matrix))) * 100


#    counter = 0
#    for i in range(data.shape[0]):
#        for j in range(i+1, data.shape[0]):
#            print "%g"%(counter/float(data.shape[0]*(data.shape[0]-1)/2))
#            r,_ = pearsonr(data[i,:], data[j,:])
#            corr_matrix[counter] = r
#            counter += 1
    del corr_matrix
    return os.path.abspath(out_file)
Example #30
    def loglike(self, endog, mu, scale=1.0):
        """
        The log-likelihood in terms of the fitted mean response.

        Parameters
        ----------
        endog : array-like
            Endogenous response variable
        mu : array-like
            Fitted mean response variable
        scale : float, optional
            Scales the loglikelihood function. The default is 1.

        Returns
        -------
        llf : float
            The value of the loglikelihood function evaluated at
            (endog,mu,scale) as defined below.

        Notes
        -----
        If the link is the identity link function then the
        loglikelihood function is the same as the classical OLS model.
        llf = -(nobs/2)*(log(SSR) + (1 + log(2*pi/nobs)))
        where SSR = sum((endog-link^(-1)(mu))**2)

        If the link is not the identity link then the loglikelihood
        function is defined as
        llf = sum((`endog`*`mu`-`mu`**2/2)/`scale` - `endog`**2/(2*`scale`) - \
            (1/2.)*log(2*pi*`scale`))
        """
        if isinstance(self.link, L.Power) and self.link.power == 1:
            # This is just the loglikelihood for classical OLS
            nobs2 = endog.shape[0] / 2.0
            SSR = ss(endog - self.fitted(mu))
            llf = -np.log(SSR) * nobs2
            llf -= (1 + np.log(np.pi / nobs2)) * nobs2
            return llf
        else:
            # Return the loglikelihood for Gaussian GLM
            return np.sum(
                (endog * mu - mu ** 2 / 2) / scale - endog ** 2 / (2 * scale) - 0.5 * np.log(2 * np.pi * scale)
            )
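A small numeric check of the identity-link formula quoted in the docstring: -(nobs/2)*(log(SSR) + 1 + log(2*pi/nobs)) is just the Gaussian log-likelihood summed over the observations with the variance set to its MLE, SSR/nobs.

import numpy as np

rng = np.random.RandomState(6)
endog = rng.normal(size=40)
mu = np.full_like(endog, endog.mean())       # fitted values of an intercept-only model
SSR = np.sum((endog - mu) ** 2)
nobs = endog.size
llf_closed = -(nobs / 2.0) * (np.log(SSR) + 1 + np.log(2 * np.pi / nobs))
sigma2 = SSR / nobs
llf_direct = np.sum(-0.5 * np.log(2 * np.pi * sigma2) - (endog - mu) ** 2 / (2 * sigma2))
assert np.allclose(llf_closed, llf_direct)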
Example #31
    def loglike(self, Y, mu, scale=1.):
        """
        Loglikelihood function for Gaussian exponential family distribution.

        Parameters
        ----------
        Y : array-like
            Endogenous response variable
        mu : array-like
            Fitted mean response variable
        scale : float, optional
            Scales the loglikelihood function. The default is 1.

        Returns
        -------
        llf : float
            The value of the loglikelihood function evaluated at (Y,mu,scale)
            as defined below.

        Formulas
        --------
        If the link is the identity link function then the
        loglikelihood function is the same as the classical OLS model.
        llf = -(nobs/2)*(log(SSR) + (1 + log(2*pi/nobs)))
        where SSR = sum((Y-link^(-1)(mu))**2)

        If the link is not the identity link then the loglikelihood
        function is defined as
        llf = sum((`Y`*`mu`-`mu`**2/2)/`scale` - `Y`**2/(2*`scale`) - \
            (1/2.)*log(2*pi*`scale`))
        """
        if isinstance(self.link, L.Power) and self.link.power == 1:
        # This is just the loglikelihood for classical OLS
            nobs2 = Y.shape[0]/2.
            SSR = ss(Y-self.fitted(mu))
            llf = -np.log(SSR) * nobs2
            llf -= (1+np.log(np.pi/nobs2))*nobs2
            return llf
        else:
        # Return the loglikelihood for Gaussian GLM
            return np.sum((Y*mu-mu**2/2)/scale-Y**2/(2*scale)-\
                    .5*np.log(2*np.pi*scale))
Example #32
    def loglike(self, Y, mu, scale=1.):
        """
        Loglikelihood function for Gaussian exponential family distribution.

        Parameters
        ----------
        Y : array-like
            Endogenous response variable
        mu : array-like
            Fitted mean response variable
        scale : float, optional
            Scales the loglikelihood function. The default is 1.

        Returns
        -------
        llf : float
            The value of the loglikelihood function evaluated at (Y,mu,scale)
            as defined below.

        Notes
        -----
        If the link is the identity link function then the
        loglikelihood function is the same as the classical OLS model.
        llf = -(nobs/2)*(log(SSR) + (1 + log(2*pi/nobs)))
        where SSR = sum((Y-link^(-1)(mu))**2)

        If the link is not the identity link then the loglikelihood
        function is defined as
        llf = sum((`Y`*`mu`-`mu`**2/2)/`scale` - `Y`**2/(2*`scale`) - \
            (1/2.)*log(2*pi*`scale`))
        """
        if isinstance(self.link, L.Power) and self.link.power == 1:
            # This is just the loglikelihood for classical OLS
            nobs2 = Y.shape[0] / 2.
            SSR = ss(Y - self.fitted(mu))
            llf = -np.log(SSR) * nobs2
            llf -= (1 + np.log(np.pi / nobs2)) * nobs2
            return llf
        else:
            # Return the loglikelihood for Gaussian GLM
            return np.sum((Y*mu-mu**2/2)/scale-Y**2/(2*scale)-\
                    .5*np.log(2*np.pi*scale))
Example #33
def f_oneway(dm, levels):
    bign = len(levels)  # number of observations
    dm = np.asarray(dm)  # distance matrix
    a = len(set(levels))  # number of levels
    n = bign / a  # number of observations per level

    assert dm.shape == (bign, bign)  # check the dist matrix is square and the size
    # corresponds to the length of levels

    # total sum of squared distances
    sst = np.sum(stats.ss(r) for r in (s[n + 1 :] for n, s in enumerate(dm[:-1]))) / float(bign)

    # sum of within-group squares
    # itertools.combinations(xrange(len(dm)),2)#top half of dm
    ssw = np.sum(
        (dm[i][j] ** 2 for i, j in product(xrange(len(dm)), xrange(1, len(dm))) if i < j and levels[i] == levels[j])
    ) / float(n)

    ssa = sst - ssw

    fstat = (ssa / float(a - 1)) / (ssw / float(bign - a))
    # print (fstat,sst,ssa,ssw,a,bign,n)

    return fstat
Example #34
def descstats(data, cols=None, axis=0):
    '''
    Prints descriptive statistics for one or multiple variables.

    Parameters
    ------------
    data: numpy array
        `x` is the data

    cols: list, optional
        A list of the column numbers or field names (for a recarray) of variables.
        Default is all columns.

    axis: 1 or 0
        axis order of data.  Default is 0 for column-ordered data.

    Examples
    --------
    >>> descstats(data.exog, cols=['x_1','x_2','x_3'])
    '''

    x = np.array(data)  # or rather, the data we're interested in
    if cols is None:
        #       if isinstance(x, np.recarray):
        #            cols = np.array(len(x.dtype.names))
        if not isinstance(x, np.recarray) and x.ndim == 1:
            x = x[:, None]

    if x.shape[1] == 1:
        desc = '''
    ---------------------------------------------
    Univariate Descriptive Statistics
    ---------------------------------------------

    Var. Name   %(name)12s
    ----------
    Obs.          %(nobs)22i  Range                  %(range)22s
    Sum of Wts.   %(sum)22s  Coeff. of Variation     %(coeffvar)22.4g
    Mode          %(mode)22.4g  Skewness                %(skewness)22.4g
    Repeats       %(nmode)22i  Kurtosis                %(kurtosis)22.4g
    Mean          %(mean)22.4g  Uncorrected SS          %(uss)22.4g
    Median        %(median)22.4g  Corrected SS            %(ss)22.4g
    Variance      %(variance)22.4g  Sum Observations        %(sobs)22.4g
    Std. Dev.     %(stddev)22.4g
    ''' % {'name': cols, 'sum': 'N/A', 'nobs': len(x), 'mode': \
    stats.mode(x)[0][0], 'nmode': stats.mode(x)[1][0], \
    'mean': x.mean(), 'median': np.median(x), 'range': \
    '('+str(x.min())+', '+str(x.max())+')', 'variance': \
    x.var(), 'stddev': x.std(), 'coeffvar': \
    stats.variation(x), 'skewness': stats.skew(x), \
    'kurtosis': stats.kurtosis(x), 'uss': stats.ss(x),\
    'ss': stats.ss(x-x.mean()), 'sobs': np.sum(x)}

        #    ''' % {'name': cols[0], 'sum': 'N/A', 'nobs': len(x[cols[0]]), 'mode': \
        #    stats.mode(x[cols[0]])[0][0], 'nmode': stats.mode(x[cols[0]])[1][0], \
        #    'mean': x[cols[0]].mean(), 'median': np.median(x[cols[0]]), 'range': \
        #    '('+str(x[cols[0]].min())+', '+str(x[cols[0]].max())+')', 'variance': \
        #    x[cols[0]].var(), 'stddev': x[cols[0]].std(), 'coeffvar': \
        #    stats.variation(x[cols[0]]), 'skewness': stats.skew(x[cols[0]]), \
        #    'kurtosis': stats.kurtosis(x[cols[0]]), 'uss': stats.ss(x[cols[0]]),\
        #    'ss': stats.ss(x[cols[0]]-x[cols[0]].mean()), 'sobs': np.sum(x[cols[0]])}

        desc += '''

    Percentiles
    -------------
    1  %%          %12.4g
    5  %%          %12.4g
    10 %%          %12.4g
    25 %%          %12.4g

    50 %%          %12.4g

    75 %%          %12.4g
    90 %%          %12.4g
    95 %%          %12.4g
    99 %%          %12.4g
    ''' % tuple([
            stats.scoreatpercentile(x, per)
            for per in (1, 5, 10, 25, 50, 75, 90, 95, 99)
        ])
        t, p_t = stats.ttest_1samp(x, 0)
        M, p_M = sign_test(x)
        S, p_S = stats.wilcoxon(np.squeeze(x))

        desc += '''

    Tests of Location (H0: Mu0=0)
    -----------------------------
    Test                Statistic       Two-tailed probability
    -----------------+-----------------------------------------
    Student's t      |  t %7.5f   Pr > |t|   <%.4f
    Sign             |  M %8.2f   Pr >= |M|  <%.4f
    Signed Rank      |  S %8.2f   Pr >= |S|  <%.4f

    ''' % (t, p_t, M, p_M, S, p_S)
# Should this be part of a 'descstats'
# in any event these should be split up, so that they can be called
# individually and only returned together if someone calls summary
# or something of the sort

    elif x.shape[1] > 1:
        desc ='''
    Var. Name   |     Obs.        Mean    Std. Dev.           Range
    ------------+--------------------------------------------------------'''+\
            os.linesep

        # for recarrays with columns passed as names
        #        if isinstance(cols[0],str):
        #            for var in cols:
        #                desc += "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g \
        #%(range)20s" %  {'name': var, 'obs': len(x[var]), 'mean': x[var].mean(),
        #        'stddev': x[var].std(), 'range': '('+str(x[var].min())+', '\
        #                +str(x[var].max())+')'+os.linesep}
        #        else:
        for var in range(x.shape[1]):
            desc += "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g \
%(range)20s"         % {'name': var, 'obs': len(x[:,var]), 'mean': x[:,var].mean(),
            'stddev': x[:,var].std(), 'range': '('+str(x[:,var].min())+', '+\
            str(x[:,var].max())+')'+os.linesep}
    else:
        raise ValueError, "data not understood"

    return desc
Example #35
def statistics(stat, infile, outfile, previous_p3_mosaic_dir=None):
    """Calculate the statistics
    """

    # Open the original file
    dataset = gdal.Open(infile, gdal.GA_ReadOnly)
    num_layers = dataset.RasterCount
    # get the projection information
    no_data_value, xsize, ysize, geo_trans, projection, data_type = get_geo_info(
        infile)

    # get the numpy 3rd dimension array stack of the bands of image
    raster_stack = bands2layerstack(infile)

    # define the default output type format
    output_type = gdal.GDT_Float32

    # call built in numpy statistical functions, with a specified axis. if
    # axis=2 means it will calculate along the 'depth' axis, per pixel.
    # with the return being n by m, the shape of each band.
    #
    # Calculate the median statistic
    if stat == 'median':
        new_array = np.nanmedian(raster_stack, axis=2)
        output_type = gdal.GDT_UInt16
    # Calculate the mean statistic
    if stat == 'mean':
        new_array = np.nanmean(raster_stack, axis=2)
        output_type = gdal.GDT_UInt16
    # Calculate the standard deviation
    if stat == 'std':
        new_array = np.nanstd(raster_stack, axis=2)
    # Calculate the valid data
    if stat == 'valid_data':
        # calculate the number of valid data used in statistics products in percentage (0-100%),
        # this count the valid data (no nans) across the layers (time axis)
        new_array = (num_layers -
                     np.isnan(raster_stack).sum(axis=2)) * 100 / num_layers
    # Calculate the signal-to-noise ratio
    if stat == 'snr':
        # this signal-to-noise ratio defined as the mean divided by the standard deviation.
        m = np.nanmean(raster_stack, axis=2)
        sd = np.nanstd(raster_stack, axis=2, ddof=0)
        new_array = np.where(sd == 0, 0, m / sd)
    # Calculate the coefficient of variation
    if stat == 'coeff_var':
        # the ratio of the biased standard deviation to the mean
        new_array = variation(raster_stack, axis=2, nan_policy='omit')
    # Calculate the Pearson's correlation coefficient
    if stat == 'pearson_corr':
        # https://github.com/scipy/scipy/blob/v0.14.0/scipy/stats/stats.py#L2392
        # get array of the previous mean file
        previous_dataset_file = os.path.join(
            previous_p3_mosaic_dir,
            os.path.basename(outfile).split('_pearson_corr.tif')[0] + '.tif')

        # get the numpy 3rd dimension array stack of the bands of image
        previous_raster_stack = bands2layerstack(previous_dataset_file)

        # raster_stack and previous_raster_stack should have same length in all axis
        if raster_stack.shape != previous_raster_stack.shape:
            z_rs = raster_stack.shape[2]
            z_prs = previous_raster_stack.shape[2]

            if z_rs > z_prs:
                raster_stack = np.delete(raster_stack, np.s_[z_prs - z_rs:], 2)
            if z_prs > z_rs:
                previous_raster_stack = np.delete(previous_raster_stack,
                                                  np.s_[z_rs - z_prs:], 2)

        # propagate the nan values across the pair values in the same position for the
        # two raster in both directions
        mask1 = np.isnan(raster_stack)
        mask2 = np.isnan(previous_raster_stack)
        combined_mask = mask1 | mask2
        raster_stack = np.where(combined_mask, np.nan, raster_stack)
        previous_raster_stack = np.where(combined_mask, np.nan,
                                         previous_raster_stack)
        del mask1, mask2, combined_mask

        mean_rs = np.nanmean(raster_stack, axis=2, keepdims=True)
        mean_prs = np.nanmean(previous_raster_stack, axis=2, keepdims=True)
        m_rs = np.nan_to_num(raster_stack - mean_rs)
        m_prs = np.nan_to_num(previous_raster_stack - mean_prs)
        r_num = np.add.reduce(m_rs * m_prs, axis=2)
        r_den = np.sqrt(ss(m_rs, axis=2) * ss(m_prs, axis=2))
        r = r_num / r_den

        # return the r coefficient -1 to 1
        new_array = r

    #### create the output geo tif
    # Set up the GTiff driver
    driver = gdal.GetDriverByName('GTiff')

    new_dataset = driver.Create(outfile, xsize, ysize, 1, output_type,
                                ["COMPRESS=LZW", "PREDICTOR=2", "TILED=YES"])
    # the '1' is for band 1
    new_dataset.SetGeoTransform(geo_trans)
    new_dataset.SetProjection(projection.ExportToWkt())
    # Write the array
    new_dataset.GetRasterBand(1).WriteArray(new_array)
    new_dataset.GetRasterBand(1).SetNoDataValue(np.nan)
Example #36
def descstats(data, cols=None, axis=0):
    '''
    Prints descriptive statistics for one or multiple variables.

    Parameters
    ------------
    data: numpy array
        `x` is the data

    cols: list, optional
        A list of the column numbers or field names (for a recarray) of variables.
        Default is all columns.

    axis: 1 or 0
        axis order of data.  Default is 0 for column-ordered data.

    Examples
    --------
    >>> descstats(data.exog, cols=['x_1','x_2','x_3'])

    '''

    x = np.array(data)  # or rather, the data we're interested in
    if cols is None:
#       if isinstance(x, np.recarray):
#            cols = np.array(len(x.dtype.names))
        if not isinstance(x, np.recarray) and x.ndim == 1:
            x = x[:,None]

    if x.shape[1] == 1:
        desc = '''
    ---------------------------------------------
    Univariate Descriptive Statistics
    ---------------------------------------------

    Var. Name   %(name)12s
    ----------
    Obs.          %(nobs)22i  Range                  %(range)22s
    Sum of Wts.   %(sum)22s  Coeff. of Variation     %(coeffvar)22.4g
    Mode          %(mode)22.4g  Skewness                %(skewness)22.4g
    Repeats       %(nmode)22i  Kurtosis                %(kurtosis)22.4g
    Mean          %(mean)22.4g  Uncorrected SS          %(uss)22.4g
    Median        %(median)22.4g  Corrected SS            %(ss)22.4g
    Variance      %(variance)22.4g  Sum Observations        %(sobs)22.4g
    Std. Dev.     %(stddev)22.4g
    ''' % {'name': cols, 'sum': 'N/A', 'nobs': len(x), 'mode': \
    stats.mode(x)[0][0], 'nmode': stats.mode(x)[1][0], \
    'mean': x.mean(), 'median': np.median(x), 'range': \
    '('+str(x.min())+', '+str(x.max())+')', 'variance': \
    x.var(), 'stddev': x.std(), 'coeffvar': \
    stats.variation(x), 'skewness': stats.skew(x), \
    'kurtosis': stats.kurtosis(x), 'uss': stats.ss(x),\
    'ss': stats.ss(x-x.mean()), 'sobs': np.sum(x)}

#    ''' % {'name': cols[0], 'sum': 'N/A', 'nobs': len(x[cols[0]]), 'mode': \
#    stats.mode(x[cols[0]])[0][0], 'nmode': stats.mode(x[cols[0]])[1][0], \
#    'mean': x[cols[0]].mean(), 'median': np.median(x[cols[0]]), 'range': \
#    '('+str(x[cols[0]].min())+', '+str(x[cols[0]].max())+')', 'variance': \
#    x[cols[0]].var(), 'stddev': x[cols[0]].std(), 'coeffvar': \
#    stats.variation(x[cols[0]]), 'skewness': stats.skew(x[cols[0]]), \
#    'kurtosis': stats.kurtosis(x[cols[0]]), 'uss': stats.ss(x[cols[0]]),\
#    'ss': stats.ss(x[cols[0]]-x[cols[0]].mean()), 'sobs': np.sum(x[cols[0]])}

        desc+= '''

    Percentiles
    -------------
    1  %%          %12.4g
    5  %%          %12.4g
    10 %%          %12.4g
    25 %%          %12.4g

    50 %%          %12.4g

    75 %%          %12.4g
    90 %%          %12.4g
    95 %%          %12.4g
    99 %%          %12.4g
    ''' % tuple([stats.scoreatpercentile(x,per) for per in (1,5,10,25,
                50,75,90,95,99)])
        t,p_t=stats.ttest_1samp(x,0)
        M,p_M=sign_test(x)
        S,p_S=stats.wilcoxon(np.squeeze(x))

        desc+= '''

    Tests of Location (H0: Mu0=0)
    -----------------------------
    Test                Statistic       Two-tailed probability
    -----------------+-----------------------------------------
    Student's t      |  t %7.5f   Pr > |t|   <%.4f
    Sign             |  M %8.2f   Pr >= |M|  <%.4f
    Signed Rank      |  S %8.2f   Pr >= |S|  <%.4f

    ''' % (t,p_t,M,p_M,S,p_S)
# Should this be part of a 'descstats'
# in any event these should be split up, so that they can be called
# individually and only returned together if someone calls summary
# or something of the sort

    elif x.shape[1] > 1:
        desc ='''
    Var. Name   |     Obs.        Mean    Std. Dev.           Range
    ------------+--------------------------------------------------------'''+\
            os.linesep

# for recarrays with columns passed as names
#        if isinstance(cols[0],str):
#            for var in cols:
#                desc += "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g \
#%(range)20s" %  {'name': var, 'obs': len(x[var]), 'mean': x[var].mean(),
#        'stddev': x[var].std(), 'range': '('+str(x[var].min())+', '\
#                +str(x[var].max())+')'+os.linesep}
#        else:
        for var in range(x.shape[1]):
                desc += "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g \
%(range)20s" % {'name': var, 'obs': len(x[:,var]), 'mean': x[:,var].mean(),
                'stddev': x[:,var].std(), 'range': '('+str(x[:,var].min())+', '+\
                str(x[:,var].max())+')'+os.linesep}
    else:
        raise ValueError, "data not understood"

    return desc