コード例 #1
ファイル: stats.py プロジェクト: ardila/yamutils
def pearsonr(x, y):
    """Column-wise Pearson correlation, generalized from scipy.stats.pearsonr.

    Parameters
    ----------
    x, y : array_like
        Observations run along the first axis; both inputs must have the
        same length ``n``. Trailing axes are flattened, so ``x`` is handled
        as ``(n, p)`` and ``y`` as ``(n, q)`` (a 1-D input is one column).

    Returns
    -------
    r : ndarray, shape (p, q)
        Correlation of every column of ``x`` with every column of ``y``.
    prob : ndarray, shape (p, q)
        Two-tailed p-values based on ``n - 2`` degrees of freedom.
    """
    # Local import: scipy.stats.betai is gone from modern SciPy, while
    # scipy.special.betainc is the same regularized incomplete beta function.
    from scipy.special import betainc

    # Convert first so that plain lists are accepted (``.shape`` was
    # previously read before the conversion).
    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)
    # Flatten trailing axes of both inputs so the result is always (p, q);
    # a 1-D ``y`` paired with a 2-D ``x`` used to broadcast to (p, p).
    x = x.reshape((n, int(np.prod(x.shape[1:]))))
    y = y.reshape((n, int(np.prod(y.shape[1:]))))

    xm = x - x.mean(0)
    ym = y - y.mean(0)

    r_num = np.dot(xm.T, ym)
    r_den = np.sqrt(np.outer((xm * xm).sum(0), (ym * ym).sum(0)))
    r = r_num / r_den

    # |r| > 1 can only be a floating point artifact; clamp both ends so the
    # square root below never sees a negative argument.
    r = np.clip(r, -1.0, 1.0)
    df = n - 2

    # Use a small floating point value to prevent divide-by-zero nonsense
    # fixme: TINY is probably not the right value and this is probably not
    # the way to be robust. The scheme used in spearmanr is probably better.
    TINY = 1.0e-20
    t = r * np.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY)))
    prob = betainc(0.5 * df, 0.5, df / (df + t * t))

    return r, prob
コード例 #2
def vectorized_correlation(x, y):
    """Compute correlation coefficient between arrays with vectorization.

    Parameters
    ----------
    x, y : array-like
        Dimensions on the final axis should match, computation will be
        vectorized over preceding axes. Dimensions will be matched, or
        broadcasted, depending on shapes. In other words, passing two (m x n)
        arrays will compute the correlation between each pair of rows and
        return a vector of length m. Passing one vector of length n and one
        array of shape (m x n) will compute the correlation between the vector
        and each row in the array, also returning a vector of length m.

    Returns
    -------
    r : array
        Correlation coefficient(s).
    """
    x, y = np.asarray(x), np.asarray(y)
    # Center along the last axis; keepdims lets the means broadcast back.
    xm = x - x.mean(axis=-1, keepdims=True)
    ym = y - y.mean(axis=-1, keepdims=True)
    r_num = np.sum(xm * ym, axis=-1)
    # Plain NumPy sums of squares: scipy.stats.ss no longer exists in SciPy.
    r_den = np.sqrt(np.sum(xm * xm, axis=-1) * np.sum(ym * ym, axis=-1))
    return r_num / r_den
コード例 #3
ファイル: stats.py プロジェクト: cadieu/yamutils
def pearsonr(x, y):
    """Pearson correlation between the columns of two sample matrices.

    Generalized from scipy.stats.pearsonr.

    Parameters
    ----------
    x, y : array_like
        Samples are indexed by the first axis and both inputs must share
        that length ``n``. Any trailing axes are flattened: ``x`` becomes
        ``(n, p)`` and ``y`` becomes ``(n, q)``; 1-D input is one column.

    Returns
    -------
    r : ndarray, shape (p, q)
        ``r[i, j]`` is the correlation of column ``i`` of ``x`` with column
        ``j`` of ``y``.
    prob : ndarray, shape (p, q)
        Matching two-tailed p-values (``n - 2`` degrees of freedom).
    """
    # scipy.stats.betai has been removed from SciPy; betainc from
    # scipy.special computes the same regularized incomplete beta function.
    from scipy.special import betainc

    # asarray comes first so sequences without a ``.shape`` are accepted.
    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)
    # Both inputs are made 2-D; leaving a 1-D ``y`` as is made the division
    # below broadcast to (p, p) for 2-D ``x``.
    x = x.reshape((n, int(np.prod(x.shape[1:]))))
    y = y.reshape((n, int(np.prod(y.shape[1:]))))

    xm = x - x.mean(0)
    ym = y - y.mean(0)

    r_num = np.dot(xm.T, ym)
    r_den = np.sqrt(np.outer((xm * xm).sum(0), (ym * ym).sum(0)))
    r = r_num / r_den

    # Values outside [-1, 1] are floating point artifacts. Clamping the lower
    # end as well keeps the t statistic below from becoming NaN.
    r = np.clip(r, -1.0, 1.0)
    df = n - 2

    # Use a small floating point value to prevent divide-by-zero nonsense
    # fixme: TINY is probably not the right value and this is probably not
    # the way to be robust. The scheme used in spearmanr is probably better.
    TINY = 1.0e-20
    t = r * np.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY)))
    prob = betainc(0.5 * df, 0.5, df / (df + t * t))

    return r, prob
コード例 #4
ファイル: ols.py プロジェクト: twktheainur/fastreg
def fit(xdata, ydata):
    """Calculate 2D regression.

    Args:
        xdata (numpy.ndarray): 1D array of independent data [ntim],
            where ntim is the number of time points (or other independent
            points).
        ydata (numpy.ndarray): 2D array of dependent data [ntim, nspat],
            where nspat is the number of spatial points (or other dependent
            points).

    Returns:
        numpy.ndarray of dimension [5, nspat].  The 5 outputs are: slope,
        intercept, Pearson's correlation coefficient, two-sided p-value for
        a hypothesis test with null hypothesis that the slope is zero,
        standard error for the slope estimate.
    """
    # Small number to prevent divide-by-zero errors
    TINY = 1.0e-20

    ntim = xdata.shape[0]

    # Add a constant (1) to the xdata to allow for intercept calculation.
    # NOTE(review): row 0 of the solution is used as the slope below, which
    # presumes utils.add_constant appends the constant as the LAST column;
    # confirm against that helper.
    xdata_plus_const = utils.add_constant(xdata)

    # Solve the normal equations (X'X) beta = X'Y for every spatial point at
    # once. Passing the right-hand side as one matrix avoids relying on
    # solve() interpreting a 2-D ``b`` as a stack of vectors.
    gram = np.dot(xdata_plus_const.T, xdata_plus_const)
    moment = np.dot(xdata_plus_const.T, ydata)
    output = np.linalg.solve(gram, moment)

    # Pearson correlation coefficient (plain NumPy sums of squares; the old
    # scipy.stats.ss helper is no longer part of SciPy).
    xm, ym = xdata - xdata.mean(0), ydata - ydata.mean(0)
    ss_x = np.sum(xm ** 2)
    ss_y = np.sum(ym ** 2, 0)
    pearson_r = np.dot(xm, ym) / np.sqrt(ss_x * ss_y)

    # Two-sided p-value for a hypothesis test whose null hypothesis is that
    # the slope is zero.
    df = ntim - 2
    tval = pearson_r * np.sqrt(df / ((1.0 - pearson_r + TINY) *
                                     (1.0 + pearson_r + TINY)))
    pval = stats.distributions.t.sf(np.abs(tval), df) * 2

    # Standard error of the slope estimate: residual variance over ss_x.
    ssr = (output[0, :] ** 2) * ss_x
    se = np.sqrt((1. / df) * (ss_y - ssr))
    stderr = se / np.sqrt(ss_x)

    return np.vstack([output, pearson_r, pval, stderr])
コード例 #5
ファイル: ols.py プロジェクト: ajferraro/fastreg
def fit(xdata, ydata):
    """Calculate 2D regression.

    Args:
        xdata (numpy.ndarray): 1D array of independent data [ntim],
            where ntim is the number of time points (or other independent
            points).
        ydata (numpy.ndarray): 2D array of dependent data [ntim, nspat],
            where nspat is the number of spatial points (or other dependent
            points).

    Returns:
        numpy.ndarray of dimension [5, nspat].  The 5 outputs are: slope,
        intercept, Pearson's correlation coefficient, two-sided p-value for
        a hypothesis test with null hypothesis that the slope is zero,
        standard error for the slope estimate.
    """
    # Small number to prevent divide-by-zero errors
    TINY = 1.0e-20

    # Number of samples along the regression axis.
    ntim = xdata.shape[0]

    # Design matrix with an intercept column.
    # NOTE(review): the code below reads row 0 of the coefficients as the
    # slope, so utils.add_constant presumably places the constant column
    # last; verify against the helper.
    design = utils.add_constant(xdata)

    # Ordinary least squares via the normal equations, solved for all
    # columns of ydata in one call. The right-hand side is a single matrix,
    # so the result does not depend on how solve() treats stacked vectors.
    lhs = np.dot(design.T, design)
    rhs = np.dot(design.T, ydata)
    output = np.linalg.solve(lhs, rhs)

    # Pearson correlation coefficient. Sums of squares are computed with
    # NumPy because scipy.stats.ss is not available in current SciPy.
    xm, ym = xdata - xdata.mean(0), ydata - ydata.mean(0)
    sum_sq_x = np.sum(xm ** 2)
    sum_sq_y = np.sum(ym ** 2, 0)
    pearson_r = np.dot(xm, ym) / np.sqrt(sum_sq_x * sum_sq_y)

    # Two-sided p-value for a hypothesis test whose null hypothesis is that
    # the slope is zero.
    df = ntim - 2
    tval = pearson_r * np.sqrt(
        df / ((1.0 - pearson_r + TINY) * (1.0 + pearson_r + TINY)))
    pval = stats.distributions.t.sf(np.abs(tval), df) * 2

    # Standard error of the slope estimate.
    explained = (output[0, :] ** 2) * sum_sq_x
    se = np.sqrt((1.0 / df) * (sum_sq_y - explained))
    stderr = se / np.sqrt(sum_sq_x)

    return np.vstack([output, pearson_r, pval, stderr])
コード例 #6
def mypearsonr(x, y):
    """
    Calculates a Pearson correlation coefficient and the p-value for testing
    non-correlation.

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear
    relationship. Positive correlations imply that as x increases, so does
    y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x : (N,) array_like
    y : (N,) array_like

    Returns
    -------
    (Pearson's correlation coefficient,
     2-tailed p-value)
    """
    # Local import: scipy.stats.betai is no longer shipped with SciPy;
    # betainc is the same regularized incomplete beta function.
    from scipy.special import betainc

    # x and y should have same length.
    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)
    xm, ym = x - x.mean(), y - y.mean()
    r_num = np.add.reduce(xm * ym)
    r_den = np.sqrt(np.sum(xm * xm, axis=0) * np.sum(ym * ym, axis=0))
    # Rounding can push |r| marginally past 1, which would make t_squared
    # negative, so clamp.
    r = np.clip(r_num / r_den, -1.0, 1.0)
    df = n - 2
    # ``abs(r.all()) == 1.0`` was true for every non-zero r (``all()`` yields
    # a bool), so the p-value collapsed to 0; compare |r| itself instead.
    if np.all(np.abs(r) == 1.0):
        prob = 0.0
    else:
        # Only array-valued r can still hold entries with |r| == 1 here;
        # those divide by zero and correctly end up with a p-value of 0.
        with np.errstate(divide="ignore", invalid="ignore"):
            t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
        prob = betainc(0.5 * df, 0.5, df / (df + t_squared))
    return r, prob
コード例 #7
ファイル: kmeans.py プロジェクト: lkhqz/EcocampHackathon2014
def mypearsonr(x, y):
    """
    Calculates a Pearson correlation coefficient and the p-value for testing
    non-correlation.

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear
    relationship. Positive correlations imply that as x increases, so does
    y. Negative correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x : (N,) array_like
    y : (N,) array_like

    Returns
    -------
    (Pearson's correlation coefficient,
     2-tailed p-value)
    """
    # betai was dropped from scipy.stats; scipy.special.betainc is the
    # equivalent regularized incomplete beta function.
    from scipy.special import betainc

    # x and y should have same length.
    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)
    mx = x.mean()
    my = y.mean()
    xm, ym = x - mx, y - my
    r_num = np.add.reduce(xm * ym)
    r_den = np.sqrt(np.sum(xm * xm, axis=0) * np.sum(ym * ym, axis=0))
    r = r_num / r_den
    # Guard against |r| drifting just above 1 through rounding.
    r = np.clip(r, -1.0, 1.0)
    df = n - 2

    # The previous test ``abs(r.all()) == 1.0`` compared a boolean with 1.0
    # and therefore fired for any non-zero correlation.
    exact = np.abs(r) == 1.0
    if np.all(exact):
        # Exact linear relationship: the p-value is zero by definition.
        prob = 0.0
    else:
        # For array-valued r a few entries may still be +/-1; the resulting
        # infinite t statistic maps to a p-value of 0, so silence the warning.
        with np.errstate(divide="ignore", invalid="ignore"):
            t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
        prob = betainc(0.5 * df, 0.5, df / (df + t_squared))
    return r, prob
コード例 #8
ファイル: BrownsMethods.py プロジェクト: DavyK/pCombine
def calcBrownsCombinedPnanRemove(snpSet, genotypeArray):
    """Combine p-values with Brown's method, ignoring columns that hold NaNs.

    Parameters
    ----------
    snpSet : object
        Must expose ``nSNPs`` and ``getPvalues()``; the latter returns the
        raw and the adjusted p-value arrays.
    genotypeArray : ndarray
        2-D array with one row per SNP. Every column containing at least one
        NaN is removed before the row correlations are computed.

    Returns
    -------
    (brownsP, adjBrownsP)
        Combined p-values for the raw and the adjusted inputs.
    """
    nSNPs = snpSet.nSNPs
    pValArray, adjpValArray = snpSet.getPvalues()
    # Fisher statistics for both sets of p-values.
    chisq = sum(-2 * np.log(pValArray))
    adjchisq = sum(-2 * np.log(adjpValArray))

    # Drop every column with missing data. (The lookup result was previously
    # stored under one name and read back under a misspelled one.)
    colsWithMissingData = np.unique(np.where(np.isnan(genotypeArray))[1])
    genotypeArray = np.delete(genotypeArray, colsWithMissingData, 1)

    # Center each row and pre-compute its norm for the correlations.
    datam = genotypeArray - genotypeArray.mean(axis=1)[:, None]
    datass = np.sqrt(np.sum(datam * datam, axis=1))

    # Accumulate the covariance approximation over all SNP pairs (i < j).
    runningSum = 0
    for i in range(nSNPs - 1):
        temp = np.dot(datam[i:], datam[i].T)
        d = datass[i:] * datass[i]
        # [1:] skips the correlation of SNP i with itself.
        rs = np.absolute(temp / d)[1:]
        runningSum += 3.25 * np.sum(rs) + .75 * np.dot(rs, rs)

    sigmaSq = 4 * nSNPs + 2 * runningSum
    E = 2 * nSNPs
    df = (2 * (E * E)) / sigmaSq
    # Scale factor applied to the Fisher statistics before the chi-square test.
    scale = sigmaSq / (2 * E)
    brownsP = stats.chi2.sf(chisq / scale, df)
    adjBrownsP = stats.chi2.sf(adjchisq / scale, df)

    return brownsP, adjBrownsP
コード例 #9
ファイル: permanova.py プロジェクト: theJohnnyBrown/permanova
def select_ss(dm, levels, included):
    """Sum the squared entries of ``dm`` over selected index pairs.

    Parameters
    ----------
    dm : sequence of sequences
        Distance matrix, indexed as ``dm[i][j]``.
    levels : sequence
        Level label of every observation, indexed like ``dm``.
    included : callable
        ``included(levels[i], levels[j])`` decides whether pair (i, j)
        contributes.

    Returns
    -------
    Sum of ``dm[i][j] ** 2`` over the pairs produced by
    ``above_diagonal(len(dm))`` that pass ``included``.
    """
    bign = len(dm)

    # The squares are summed directly: scipy.stats.ss cannot consume a
    # generator (NumPy wraps it in a 0-d object array).
    # NOTE(review): above_diagonal is defined elsewhere; presumably it yields
    # (i, j) pairs with i < j - confirm.
    return sum(
        dm[i][j] ** 2
        for i, j in above_diagonal(bign)
        if included(levels[i], levels[j])
    )
コード例 #10
ファイル: permanova.py プロジェクト: theJohnnyBrown/permanova
def f_twoway(dm, levels):
    """F statistics for a two-factor design computed from a distance matrix.

    Parameters
    ----------
    dm : array_like
        Square matrix of pairwise distances between observations.
    levels : sequence
        One ``(a_level, b_level)`` pair per observation.

    Returns
    -------
    (f_interaction, f_a, f_b)
        F ratios for the interaction and for the two main effects.
    """
    bign = len(levels)  # number of observations
    dm = np.asarray(dm)  # distance matrix
    a = len(set(lvl[0] for lvl in levels))  # number of a-levels
    b = len(set(lvl[1] for lvl in levels))  # number of b-levels
    n = bign / float(a * b)  # number of observations per level

    # Sum of all squared distances above the diagonal. np.triu replaces
    # stats.ss(chain(...)), which failed because ss cannot square an iterator.
    sst = np.sum(np.triu(dm, k=1) ** 2) / float(bign)

    # same level of both a and b (error, within-group)
    ssr = select_ss(dm, levels, lambda u, v: u == v) / float(n)

    # same level of a
    sswa = select_ss(dm, levels, lambda u, v: u[0] == v[0]) / float(b * n)

    # same level of b
    sswb = select_ss(dm, levels, lambda u, v: u[1] == v[1]) / float(a * n)

    ssa = sst - sswa  # effect of a
    ssb = sst - sswb  # effect of b
    ssab = sst - ssa - ssb - ssr  # interaction sum-of-squares

    # All three ratios share the same error mean square.
    ms_error = ssr / float(bign - a * b)
    f_interaction = (ssab / float((a - 1) * (b - 1))) / ms_error
    f_a = (ssa / float(a - 1)) / ms_error
    f_b = (ssb / float(b - 1)) / ms_error

    return (f_interaction, f_a, f_b)
コード例 #11
ファイル: permanova.py プロジェクト: wrshoemaker/permanova
def f_oneway(dm, levels):
    """One-way F statistic computed from a matrix of pairwise distances.

    Parameters
    ----------
    dm : array_like
        Square distance matrix with one row/column per observation.
    levels : sequence
        Group label of every observation, in the same order as ``dm``.

    Returns
    -------
    fstat : float
        Ratio of the among-group to the within-group mean square.

    Raises
    ------
    AssertionError
        If ``dm`` is not square with side ``len(levels)``.
    """
    bign = len(levels)  # number of observations
    dm = np.asarray(dm)  # distance matrix
    a = len(set(levels))  # number of levels
    # Observations per level; float() avoids floor division on Python 2.
    n = bign / float(a)

    # check the dist matrix is square and the size corresponds to the
    # length of levels
    assert dm.shape == (bign, bign)

    # Squared distances of every pair above the diagonal.
    rows, cols = np.triu_indices(bign, k=1)
    sq_dist = dm[rows, cols] ** 2

    # total sum of squared distances
    sst = sq_dist.sum() / float(bign)

    # sum of within-group squares: only pairs that share a level
    same_level = np.array(
        [levels[i] == levels[j] for i, j in zip(rows, cols)], dtype=bool)
    ssw = sq_dist[same_level].sum() / float(n)

    ssa = sst - ssw

    fstat = (ssa / float(a - 1)) / (ssw / float(bign - a))

    return fstat
コード例 #12
ファイル: permanova.py プロジェクト: wrshoemaker/permanova
def f_twoway(dm, levels):
    """Two-way F statistics derived from pairwise distances.

    Parameters
    ----------
    dm : array_like
        Square distance matrix (one row/column per observation).
    levels : sequence
        For each observation a pair ``(level_of_a, level_of_b)``.

    Returns
    -------
    (f_interaction, f_a, f_b)
        F ratio of the a-by-b interaction, of factor a and of factor b.
    """
    bign = len(levels)  #number of observations
    dm = np.asarray(dm)  #distance matrix
    # The unused count of distinct level pairs was dropped; on Python 2 its
    # name was also overwritten by the list-comprehension variable.
    a = len(set(pair[0] for pair in levels))  #number of a-levels
    b = len(set(pair[1] for pair in levels))  #number of b-levels
    n = bign / float(a * b)  #number of observations per level

    #sum of all squared distances above the diagonal; computed with np.triu
    #because stats.ss cannot handle the iterator returned by chain()
    sst = np.sum(np.triu(dm, k=1) ** 2) / float(bign)

    #same level of both a and b (error, within-group)
    ssr = select_ss(dm, levels, lambda p, q: p == q) / float(n)

    #same level of a
    sswa = select_ss(dm, levels, lambda p, q: p[0] == q[0]) / float(b * n)

    #same level of b
    sswb = select_ss(dm, levels, lambda p, q: p[1] == q[1]) / float(a * n)

    ssa = sst - sswa  #effect of a
    ssb = sst - sswb  #effect of b
    ssab = sst - ssa - ssb - ssr  #interaction sum-of-squares

    #error mean square shared by the three ratios
    mse = ssr / float(bign - a * b)
    f_interaction = (ssab / float((a - 1) * (b - 1))) / mse
    f_a = (ssa / float(a - 1)) / mse
    f_b = (ssb / float(b - 1)) / mse

    return (f_interaction, f_a, f_b)
コード例 #13
ファイル: permanova.py プロジェクト: wrshoemaker/permanova
def select_ss(dm, levels, included):
    """Return the sum of squared distances for the pairs accepted by a filter.

    Parameters
    ----------
    dm : sequence of sequences
        Distance matrix; entries are read as ``dm[i][j]``.
    levels : sequence
        Level label per observation, aligned with ``dm``.
    included : callable
        Predicate called as ``included(levels[i], levels[j])``; pairs for
        which it is falsy are skipped.

    Returns
    -------
    The sum of ``dm[i][j] ** 2`` over the accepted pairs from
    ``above_diagonal(len(dm))``.
    """
    bign = len(dm)

    # NOTE(review): above_diagonal lives elsewhere in this module and
    # presumably enumerates index pairs with i < j - verify.
    total = 0
    for i, j in above_diagonal(bign):
        if included(levels[i], levels[j]):
            # Accumulate directly; scipy.stats.ss fails on a generator
            # because NumPy turns it into a 0-d object array.
            total += dm[i][j] ** 2
    return total
コード例 #14
def fastPearsonCorrelation(TC):
    """Turn the output of fastCovariance(TC) into a correlation matrix.

    Each upper-triangle row is divided by the product of the corresponding
    norms and mirrored into the lower triangle; non-finite entries (e.g.
    from a zero norm) are reported as 0.
    """
    corr, TCm = fastCovariance(TC)
    norms = sqrt(ss(TCm, axis=1))
    for row in xrange(TC.shape[1]):
        # Scale the part of this row at and right of the diagonal ...
        corr[row, row:] = corr[row, row:] / (norms[row:] * norms[row])
        # ... and copy it into the matching column for symmetry.
        corr[row:, row] = corr[row, row:]
    finite = isfinite(corr)
    return where(finite, corr, 0.)
コード例 #15
def pearson(self, x, y):
    """Correlate row vector x with each row of y.

    ``x`` is stacked on top of ``y``; the result holds one Pearson
    coefficient per row of ``y``.
    """
    data = np.vstack((x, y))
    # Center every row on its own mean.
    datam = data - data.mean(axis=1)[:, None]
    # Row norms via NumPy; scipy.stats.ss is not available in current SciPy.
    datass = np.sqrt(np.sum(datam * datam, axis=1))
    # Row 0 is x, the remaining rows are y.
    temp = np.dot(datam[1:], datam[0].T)
    rs = temp / (datass[1:] * datass[0])
    return rs
コード例 #16
def pearson(x, y):
    """ Correlates row vector x with each row vector in 2D array y.

    Returns a 1-D array with one Pearson coefficient per row of y.
    """
    data = np.vstack((x, y))
    # Subtract each row's mean.
    datam = data - data.mean(axis=1)[:, None]
    # Euclidean norm of each centered row (replaces scipy.stats.ss, which
    # modern SciPy no longer provides).
    datass = np.sqrt(np.sum(datam * datam, axis=1))
    # Row 0 holds x; rows 1.. hold y.
    return np.dot(datam[1:], datam[0]) / (datass[1:] * datass[0])
コード例 #17
ファイル: stats.py プロジェクト: ambimorph/Neurosynth
def pearson(self, x, y):
  """ Correlates row vector x with each row vector in 2D array y.

  Returns a 1-D array holding one Pearson coefficient per row of y.
  """
  data = np.vstack((x, y))
  # Remove the mean of every row.
  datam = data - data.mean(axis=1)[:, None]
  # Norms of the centered rows; computed with NumPy since scipy.stats.ss
  # has been removed from SciPy.
  datass = np.sqrt(np.sum(datam * datam, axis=1))
  # First row is x, all following rows belong to y.
  temp = np.dot(datam[1:], datam[0].T)
  rs = temp / (datass[1:] * datass[0])
  return rs
コード例 #18
ファイル: stats.py プロジェクト: dengemann/NeuroSynth
def pearson(self, x, y):
    """Correlate row vector x with every row of the 2-D array y.

    Returns a 1-D array of Pearson coefficients, one per row of ``y``.
    """
    data = np.vstack((x, y))
    # Center the rows.
    datam = data - data.mean(axis=1)[:, None]
    # Row norms; scipy.stats.ss is unavailable in modern SciPy.
    datass = np.sqrt(np.sum(datam * datam, axis=1))
    # datam[0] is x, datam[1:] are the rows of y.
    temp = np.dot(datam[1:], datam[0].T)
    rs = temp / (datass[1:] * datass[0])
    return rs
コード例 #19
ファイル: sstat.py プロジェクト: yyfdemajia/Scripts
def repeated_oneway(data):
    """One-way repeated-measures ANOVA.

    Parameters
    ----------
    data : array_like, shape (n, k)
        ``n`` subjects (rows) measured under ``k`` conditions (columns).

    Returns
    -------
    f : float
        F ratio of the between-measurement mean square to the error mean
        square.
    p : float
        Upper-tail probability of ``f`` under the F distribution with
        ``k - 1`` and ``(n - 1) * (k - 1)`` degrees of freedom.
    """
    # Local import: the old scipy.stats.fprob helper is gone from SciPy;
    # fdtrc is the F-distribution survival function it wrapped.
    from scipy.special import fdtrc

    data = np.asarray(data)
    n = data.shape[0]
    k = data.shape[1]
    grand_mean = np.mean(data)
    measurement_mean = np.mean(data, axis=0)
    subject_mean = np.mean(data, axis=1)

    # Sums of squares: between measurements, within measurements, between
    # subjects. The error term is what remains of the within part.
    ssb = n * np.sum((measurement_mean - grand_mean) ** 2)
    ssw = np.sum((data - measurement_mean) ** 2)
    sss = k * np.sum((subject_mean - grand_mean) ** 2)
    sse = ssw - sss

    dfb = k - 1
    dfe = (n - 1) * (k - 1)
    msb = ssb / float(dfb)
    mse = sse / float(dfe)
    f = msb / mse
    p = fdtrc(dfb, dfe, f)
    return f, p
コード例 #20
ファイル: sstat.py プロジェクト: SGenheden/Scripts
def repeated_oneway(data):
  """Repeated-measures one-way ANOVA on a subjects-by-measurements table.

  Parameters
  ----------
  data : array_like, shape (n, k)
      Rows are the ``n`` subjects, columns the ``k`` repeated measurements.

  Returns
  -------
  f : float
      Between-measurement mean square divided by the error mean square.
  p : float
      Probability of an F value at least this large with ``k - 1`` and
      ``(n - 1) * (k - 1)`` degrees of freedom.
  """
  # scipy.stats.fprob no longer exists; scipy.special.fdtrc is the same
  # F-distribution survival function.
  from scipy.special import fdtrc

  data = np.asarray(data)
  n = data.shape[0]
  k = data.shape[1]
  grand_mean = np.mean(data)
  measurement_mean = np.mean(data, axis=0)
  subject_mean = np.mean(data, axis=1)
  # Between-measurement, within-measurement and between-subject sums of
  # squares, written with NumPy instead of the removed scipy.stats.ss.
  ssb = n * np.sum((measurement_mean - grand_mean) ** 2)
  ssw = np.sum((data - measurement_mean) ** 2)
  sss = k * np.sum((subject_mean - grand_mean) ** 2)
  # Error = within-measurement variation not explained by subjects.
  sse = ssw - sss
  dfb = k - 1
  dfe = (n - 1) * (k - 1)
  msb = ssb / float(dfb)
  mse = sse / float(dfe)
  f = msb / mse
  p = fdtrc(dfb, dfe, f)
  return f, p
コード例 #21
ファイル: utils.py プロジェクト: EntzeChong/PhyloDB
def f_oneway(dm, levels):
    """Compute a one-way F statistic from pairwise distances.

    Parameters
    ----------
    dm : array_like
        Square distance matrix, one row/column per observation.
    levels : sequence
        Group label for each observation (same order as ``dm``).

    Returns
    -------
    fstat : float
        Among-group mean square divided by the within-group mean square.

    Raises
    ------
    AssertionError
        If ``dm`` is not a ``len(levels)`` x ``len(levels)`` matrix.
    """
    bign = len(levels)
    dm = np.asarray(dm)
    a = len(set(levels))
    # float() keeps this a true division on Python 2 as well.
    n = bign / float(a)
    assert dm.shape == (bign, bign)

    # Squared distances for all pairs i < j, taken in one vectorized step
    # instead of feeding generators to np.sum.
    upper_i, upper_j = np.triu_indices(bign, k=1)
    squared = dm[upper_i, upper_j] ** 2

    # Total sum of squares.
    sst = squared.sum() / float(bign)

    # Within-group sum of squares: pairs whose labels agree.
    within = np.array(
        [levels[i] == levels[j] for i, j in zip(upper_i, upper_j)],
        dtype=bool)
    ssw = squared[within].sum() / float(n)

    ssa = sst - ssw
    fstat = (ssa / float(a - 1)) / (ssw / float(bign - a))
    return fstat
コード例 #22
ファイル: stats.py プロジェクト: Xen4n/neurolearn
def pearson(x, y):
    """ Correlates row vector x with each row vector in 2D array y.
    From neurosynth.stats.py - author: Tal Yarkoni

    Returns a 1-D array with one Pearson coefficient per row of y.
    """
    data = np.vstack((x, y))
    # Center each row on its mean.
    datam = data - data.mean(axis=1)[:, None]
    # Norm of every centered row (NumPy replacement for scipy.stats.ss,
    # which current SciPy no longer ships).
    datass = np.sqrt(np.sum(datam * datam, axis=1))
    # Row 0 is x; correlate it with the remaining rows.
    temp = np.dot(datam[1:], datam[0].T)
    rs = temp / (datass[1:] * datass[0])
    return rs
コード例 #23
ファイル: bootstrap.py プロジェクト: theunissenlab/tlab
# NOTE(review): this definition is damaged as shown. The parameter list is cut
# off after `return_all=False,` (the body reads `nan_remove` and `return_raw`,
# whose declarations and defaults are not visible), the docstring delimiters
# are missing, and the nested returns below appear to have lost their `else:`
# lines. Restore these from the upstream file before use; the defaults must
# not be guessed.
def jackknife_bias_correct(pairs,confidence=None,return_all=False,
    Return jackknife-bias-corrected estimate from estimate-nsamples pairs
    Pairs can be either a list of tuples, or a 2 x nestimates array.
    If 'confidence' is between 0 and 1, return the mean with lower and upper
    bounds at -/+ the confidence interval.
    If 'confidence' is None, return the mean and standard error.
    If 'return_all' is True, return the mean, standard error, number of points,
    and confidence interval size.
    # NOTE(review): the text above says "2 x nestimates", but the indexing
    # below (data[:,0], data[:,1]) treats the array as nestimates x 2.
    data = asarray(pairs)
    # Optionally keep only the rows whose first column is finite.
    if nan_remove:
        data = data[isfinite(data)[:,0],:]
    # Column 0 is the regression target; the reciprocal of column 1 is the
    # regressor.
    y = data[:,0]
    x = 1./data[:,1]
    n = len(x)
    # Compute linear regression and standard error of intercept
    # NOTE(review): linregress and ss are not defined in this snippet;
    # presumably scipy.stats.linregress and a sum-of-squares helper - confirm.
    (slope,intercept,r,p,slope_se) = linregress(x,y)
    intercept_se = slope_se * sqrt(ss(x)/n)
    # Return mean and SE if no value is specified:
    if confidence is None:
        if return_all:
            if return_raw:
                # Caution: this rebinds the name `np` locally to column 1 of
                # data for the rest of the function.
                np = data[:,1]
                max_n = max(np)
                # Mean of column 0 over the rows where column 1 is largest.
                raw_mean = mean(data[np==max_n,0])
                return intercept, intercept_se, n, raw_mean
                return intercept, intercept_se, n
            return intercept, intercept_se
    # Otherwise return intercept with confidence
        # NOTE(review): `t._ppf` is an underscore-prefixed method of an
        # object named `t` that is defined elsewhere; presumably
        # scipy.stats.t - confirm.
        t_int = t._ppf((1+confidence)/2,n-2)
        intercept_int = t_int * intercept_se
        if return_all:
            if return_raw:
                np = data[:,1]
                max_n = max(np)
                raw_mean = mean(data[np==max_n,0])
                return intercept, intercept_se, n, intercept_int, raw_mean
                return intercept, intercept_se, n, intercept_int
            return intercept, intercept - intercept_int, intercept + intercept_int
コード例 #24
def test4(TC):
    t0 = time()
    ms = TC.mean(axis=0)[(slice(None, None, None), None)]
    TCm = TC.T - ms
    TCss = sqrt(ss(TCm, axis=1))
    N = TC.shape[1]
    corr = zeros((N, N))
    for i in xrange(N):
        corr[i, i:] = dot(TCm[i:], TCm[i].T)
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    print 'Pearson      ', time() - t0
    return corr
コード例 #25
def test5(TC):
    ''' Attention for TC which is modified by the function
    t0 = time()
    ms = TC.mean(axis=0)[(slice(None, None, None), None)]
    TC -= ms.T
    TCss = sqrt(ss(TC, axis=0))
    N = TC.shape[1]
    corr = zeros((N, N))
    for i in xrange(N):
        corr[i, i:] = dot(TC[:, i:].T, TC[:, i])
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    print 'Pearson      ', time() - t0
    return corr
コード例 #26
def f_oneway(dm, levels):
    """F statistic of a one-factor design, computed from pairwise distances.

    Parameters
    ----------
    dm : array_like
        Square matrix of distances between observations.
    levels : sequence
        Group label per observation, ordered like the rows of ``dm``.

    Returns
    -------
    fstat : float
        Among-group mean square over within-group mean square.

    Raises
    ------
    AssertionError
        If the shape of ``dm`` is not ``(len(levels), len(levels))``.
    """
    bign = len(levels)
    dm = np.asarray(dm)
    a = len(set(levels))
    # True division regardless of Python version.
    n = bign / float(a)
    assert dm.shape == (bign, bign)

    # All pairs above the diagonal, squared once and reused for both sums.
    # (The generator expression that used to compute the total was missing
    # its element expression.)
    i_idx, j_idx = np.triu_indices(bign, k=1)
    d2 = dm[i_idx, j_idx] ** 2

    sst = d2.sum() / float(bign)

    # Restrict to pairs drawn from the same level.
    same = np.array(
        [levels[i] == levels[j] for i, j in zip(i_idx, j_idx)], dtype=bool)
    ssw = d2[same].sum() / float(n)

    ssa = sst - ssw
    fstat = (ssa / float(a - 1)) / (ssw / float(bign - a))
    return fstat
コード例 #27
ファイル: Practical1.py プロジェクト: jhylands/csteach
def simpleLeastSquares(X,Y):
    """
    Compute the least-squares fit of y=ax+b
    Input: X is a list of sample x values, Y is a list of the
        corresponding Y values
    Output: A 2x1 matrix [a; b]
    """
    # Sums that make up the normal equations of the straight-line fit.
    n = len(X)
    sum_x = sum(X)
    sum_y = sum(Y)
    sum_xx = sum(x * x for x in X)
    sum_xy = sum(x * y for x, y in zip(X, Y))

    # Calculate the values of matrices A and c:
    #   [sum(x^2)  sum(x)] [a]   [sum(x*y)]
    #   [sum(x)    n     ] [b] = [sum(y)  ]
    A = [[sum_xx, sum_x],
         [sum_x, n]]
    c = [[sum_xy],
         [sum_y]]

    # and return the value of p
    P = linalg.solve(A,c)
    return P
コード例 #28
def PearsonCorrelation(TC):
    ''' Return the Pearson Correlation. The calculus is done for HALF of the matrix and duplicate for symetry.
    TC need to have a shape like [time, nodes]
    Attention for TC which is modified by the function

    Returns the (nodes, nodes) correlation matrix. TC must be a float array
    because its column means are subtracted in place.
    '''
    #TODO change with covariance, see Plot.py
    # Local import keeps the function self-contained. It replaces the former
    # scipy.stats.ss (removed from SciPy) and pylab imports.
    import numpy as np

    # In-place centering of every column (the documented side effect).
    TC -= TC.mean(axis=0)
    TCss = np.sqrt((TC * TC).sum(axis=0))
    N = TC.shape[1]
    corr = np.zeros((N, N))
    for i in range(N):
        # Upper-triangle part of row i, normalized and mirrored below.
        corr[i, i:] = np.dot(TC[:, i:].T, TC[:, i])
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    return corr
コード例 #29
ファイル: correlation_matrix.py プロジェクト: stymy/pipelines
def write_correlation_matrix(in_file, mask_file, out_file):
    """Write the pairwise correlations of all masked rows to a memmap file.

    Parameters
    ----------
    in_file : str
        Image loaded with nibabel; the rows selected by the mask are
        correlated along the last axis.
    mask_file : str
        Image loaded with nibabel; entries > 0 select the rows to use.
    out_file : str
        File backing the memory map. It receives the upper-triangle
        correlations (pairs i < j, row by row), each multiplied by 10000.

    Returns
    -------
    str
        Absolute path of ``out_file``.
    """
    import nibabel as nb
    import numpy as np
    import os
    mask_nii = nb.load(mask_file)
    data_nii = nb.load(in_file)

    data = data_nii.get_data()[mask_nii.get_data() > 0, :]
    n_rows = data.shape[0]
    # Integer division: a memmap shape must be an int on Python 3 as well.
    n_pairs = n_rows * (n_rows - 1) // 2
    print(n_pairs)

    # NOTE(review): np.memmap is called with its defaults, i.e. dtype uint8
    # and mode 'r+' (the file must already exist). Correlations scaled by
    # 10000 do not fit uint8, so the stored values are cast - confirm the
    # intended on-disk dtype with the readers of this file before changing.
    corr_matrix = np.memmap(out_file, shape=(n_pairs,))

    counter = 0
    # Center each row and pre-compute its norm (NumPy replacement for the
    # removed scipy.stats.ss).
    datam = data - data.mean(axis=1)[:, None]
    datass = np.sqrt(np.sum(datam * datam, axis=1))

    status = 0
    for i in range(n_rows):
        # Correlation of row i with every later row.
        temp = np.dot(datam[i + 1:], datam[i].T)
        rs = temp / (datass[i + 1:] * datass[i])
        corr_matrix[counter:counter + len(rs)] = rs * 10000
        counter += len(rs)

        # Report progress each time it advances by more than one percent.
        percent = (counter / float(len(corr_matrix))) * 100
        if percent - status > 1:
            print("%d" % percent)
            status = percent

    # Dropping the last reference flushes the memory map to disk.
    del corr_matrix
    return os.path.abspath(out_file)
コード例 #30
ファイル: family.py プロジェクト: Wombatpm/statsmodels
    def loglike(self, endog, mu, scale=1.0):
        """
        The log-likelihood in terms of the fitted mean response.

        Parameters
        ----------
        endog : array-like
            Endogenous response variable
        mu : array-like
            Fitted mean response variable
        scale : float, optional
            Scales the loglikelihood function. The default is 1.

        Returns
        -------
        llf : float
            The value of the loglikelihood function evaluated at
            (endog,mu,scale) as defined below.

        Notes
        -----
        If the link is the identity link function then the
        loglikelihood function is the same as the classical OLS model.

        llf = -(nobs/2)*(log(SSR) + (1 + log(2*pi/nobs)))

        where SSR = sum((endog-link^(-1)(mu))**2)

        If the links is not the identity link then the loglikelihood
        function is defined as

        llf = sum((`endog`*`mu`-`mu`**2/2)/`scale` - `endog`**2/(2*`scale`) -
            (1/2.)*log(2*pi*`scale`))
        """
        if isinstance(self.link, L.Power) and self.link.power == 1:
            # This is just the loglikelihood for classical OLS
            nobs2 = endog.shape[0] / 2.0
            SSR = ss(endog - self.fitted(mu))
            llf = -np.log(SSR) * nobs2
            llf -= (1 + np.log(np.pi / nobs2)) * nobs2
            return llf
        else:
            # Return the loglikelihood for Gaussian GLM
            return np.sum(
                (endog * mu - mu ** 2 / 2) / scale
                - endog ** 2 / (2 * scale)
                - 0.5 * np.log(2 * np.pi * scale)
            )
コード例 #31
    def loglike(self, Y, mu, scale=1.):
        """
        Loglikelihood function for Gaussian exponential family distribution.

        Parameters
        ----------
        Y : array-like
            Endogenous response variable
        mu : array-like
            Fitted mean response variable
        scale : float, optional
            Scales the loglikelihood function. The default is 1.

        Returns
        -------
        llf : float
            The value of the loglikelihood function evaluated at (Y,mu,scale)
            as defined below.

        Notes
        -----
        If the link is the identity link function then the
        loglikelihood function is the same as the classical OLS model.

        llf = -(nobs/2)*(log(SSR) + (1 + log(2*pi/nobs)))

        where SSR = sum((Y-link^(-1)(mu))**2)

        If the links is not the identity link then the loglikelihood
        function is defined as

        llf = sum((`Y`*`mu`-`mu`**2/2)/`scale` - `Y`**2/(2*`scale`) -
            (1/2.)*log(2*pi*`scale`))
        """
        if isinstance(self.link, L.Power) and self.link.power == 1:
            # This is just the loglikelihood for classical OLS
            nobs2 = Y.shape[0]/2.
            SSR = ss(Y-self.fitted(mu))
            llf = -np.log(SSR) * nobs2
            llf -= (1+np.log(np.pi/nobs2))*nobs2
            return llf
        else:
            # Return the loglikelihood for Gaussian GLM. The last term was
            # cut off in this copy and is restored from the formula above.
            return np.sum((Y*mu-mu**2/2)/scale-Y**2/(2*scale)-
                          .5*np.log(2*np.pi*scale))
コード例 #32
    def loglike(self, Y, mu, scale=1.):
        """
        Loglikelihood function for Gaussian exponential family distribution.

        Parameters
        ----------
        Y : array-like
            Endogenous response variable
        mu : array-like
            Fitted mean response variable
        scale : float, optional
            Scales the loglikelihood function. The default is 1.

        Returns
        -------
        llf : float
            The value of the loglikelihood function evaluated at (Y,mu,scale)
            as defined below.

        Notes
        -----
        If the link is the identity link function then the
        loglikelihood function is the same as the classical OLS model.

        llf = -(nobs/2)*(log(SSR) + (1 + log(2*pi/nobs)))

        where SSR = sum((Y-link^(-1)(mu))**2)

        If the links is not the identity link then the loglikelihood
        function is defined as

        llf = sum((`Y`*`mu`-`mu`**2/2)/`scale` - `Y`**2/(2*`scale`) -
            (1/2.)*log(2*pi*`scale`))
        """
        if isinstance(self.link, L.Power) and self.link.power == 1:
            # This is just the loglikelihood for classical OLS
            nobs2 = Y.shape[0] / 2.
            SSR = ss(Y - self.fitted(mu))
            llf = -np.log(SSR) * nobs2
            llf -= (1 + np.log(np.pi / nobs2)) * nobs2
            return llf

        # Any other link: Gaussian GLM loglikelihood. The final term of this
        # expression was truncated and is completed from the formula in the
        # Notes section.
        return np.sum(
            (Y * mu - mu ** 2 / 2) / scale
            - Y ** 2 / (2 * scale)
            - .5 * np.log(2 * np.pi * scale)
        )
コード例 #33
ファイル: permanova.py プロジェクト: theJohnnyBrown/permanova
def f_oneway(dm, levels):
    """One-way F statistic from a distance matrix.

    Parameters
    ----------
    dm : array_like
        Square distance matrix; row/column ``i`` belongs to observation ``i``.
    levels : sequence
        Group label of each observation.

    Returns
    -------
    fstat : float
        Among-group mean square divided by the within-group mean square.

    Raises
    ------
    AssertionError
        If ``dm`` is not square with one row per entry of ``levels``.
    """
    bign = len(levels)  # number of observations
    dm = np.asarray(dm)  # distance matrix
    a = len(set(levels))  # number of levels
    # number of observations per level (true division on every Python version)
    n = bign / float(a)

    # check the dist matrix is square and the size corresponds to the
    # length of levels
    assert dm.shape == (bign, bign)

    # squared distances of all pairs above the diagonal
    pair_i, pair_j = np.triu_indices(bign, k=1)
    squared = dm[pair_i, pair_j] ** 2

    # total sum of squared distances (replaces np.sum over a generator of
    # scipy.stats.ss calls; that helper is gone from SciPy)
    sst = squared.sum() / float(bign)

    # sum of within-group squares
    in_group = np.array(
        [levels[i] == levels[j] for i, j in zip(pair_i, pair_j)], dtype=bool)
    ssw = squared[in_group].sum() / float(n)

    ssa = sst - ssw

    fstat = (ssa / float(a - 1)) / (ssw / float(bign - a))

    return fstat
コード例 #34
ファイル: descstats.py プロジェクト: yarikoptic/pystatsmodels
def descstats(data, cols=None, axis=0):
    """
    Build a text report of descriptive statistics for one or more variables.

    Parameters
    ----------
    data : array_like
        The observations.  A 1-D input is treated as a single column when
        `cols` is None.
    cols : list, optional
        Column numbers or field names (for a recarray) of the variables.
        The value is only echoed as the variable name of the single-column
        report; selecting columns through it is not implemented.
    axis : {0, 1}
        Axis order of data.  Default is 0 for column-ordered data.
        The value is accepted but not used.

    Returns
    -------
    desc : str
        For a single column: summary statistics, percentiles and three
        tests of location against zero.  For several columns: one table
        row per column with count, mean, standard deviation and range.

    Raises
    ------
    ValueError
        If the data has no columns.

    Examples
    --------
    >>> report = descstats(np.array([[1., 2.], [3., 4.]]))
    """
    # Local import: the rows of the multi-column table end in os.linesep.
    import os

    x = np.array(data)  # or rather, the data we're interested in
    if cols is None:
        if not isinstance(x, np.recarray) and x.ndim == 1:
            x = x[:, None]

    if x.shape[1] == 1:
        # Compute the mode once; both the value and its count are reported.
        mode_result = stats.mode(x)
        # NOTE(review): stats.ss is absent from recent SciPy releases --
        # confirm the SciPy version this module targets.
        desc = '''
    Univariate Descriptive Statistics

    Var. Name   %(name)12s
    Obs.          %(nobs)22i  Range                  %(range)22s
    Sum of Wts.   %(sum)22s  Coeff. of Variation     %(coeffvar)22.4g
    Mode          %(mode)22.4g  Skewness                %(skewness)22.4g
    Repeats       %(nmode)22i  Kurtosis                %(kurtosis)22.4g
    Mean          %(mean)22.4g  Uncorrected SS          %(uss)22.4g
    Median        %(median)22.4g  Corrected SS            %(ss)22.4g
    Variance      %(variance)22.4g  Sum Observations        %(sobs)22.4g
    Std. Dev.     %(stddev)22.4g
    ''' % {
            'name': cols,
            'sum': 'N/A',
            'nobs': len(x),
            'mode': mode_result[0][0],
            'nmode': mode_result[1][0],
            'mean': x.mean(),
            'median': np.median(x),
            'range': '(' + str(x.min()) + ', ' + str(x.max()) + ')',
            'variance': x.var(),
            'stddev': x.std(),
            'coeffvar': stats.variation(x),
            'skewness': stats.skew(x),
            'kurtosis': stats.kurtosis(x),
            'uss': stats.ss(x),
            'ss': stats.ss(x - x.mean()),
            'sobs': np.sum(x),
        }

        desc += '''

    1  %%          %12.4g
    5  %%          %12.4g
    10 %%          %12.4g
    25 %%          %12.4g

    50 %%          %12.4g

    75 %%          %12.4g
    90 %%          %12.4g
    95 %%          %12.4g
    99 %%          %12.4g
    ''' % tuple([
            stats.scoreatpercentile(x, per)
            for per in (1, 5, 10, 25, 50, 75, 90, 95, 99)
        ])

        # Three tests of H0: location == 0.  sign_test is not defined in
        # this block; presumably it lives elsewhere in the module.
        t, p_t = stats.ttest_1samp(x, 0)
        M, p_M = sign_test(x)
        S, p_S = stats.wilcoxon(np.squeeze(x))

        desc += '''

    Tests of Location (H0: Mu0=0)
    Test                Statistic       Two-tailed probability
    Student's t      |  t %7.5f   Pr > |t|   <%.4f
    Sign             |  M %8.2f   Pr >= |M|  <%.4f
    Signed Rank      |  S %8.2f   Pr >= |S|  <%.4f

    ''' % (t, p_t, M, p_M, S, p_S)
    # Should this be part of a 'descstats'
    # in any event these should be split up, so that they can be called
    # individually and only returned together if someone calls summary
    # or something of the sort

    elif x.shape[1] > 1:
        desc = '''
    Var. Name   |     Obs.        Mean    Std. Dev.           Range
'''
        # One row per column; the column index doubles as the variable name.
        for var in range(x.shape[1]):
            column = x[:, var]
            desc += (
                "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g "
                "%(range)20s"
            ) % {
                'name': var,
                'obs': len(column),
                'mean': column.mean(),
                'stddev': column.std(),
                'range': '(' + str(column.min()) + ', '
                         + str(column.max()) + ')' + os.linesep,
            }
    else:
        raise ValueError("data not understood")

    return desc
コード例 #35
def statistics(stat, infile, outfile, previous_p3_mosaic_dir=None):
    """Calculate the statistics

    # NOTE(review): this excerpt is incomplete. The docstring above is never
    # closed, three calls below have lost part of their argument list
    # (flagged in place), and the listing stops before the result is written.
    #
    # What the visible code does: collapse the band stack of `infile` along
    # axis 2 with the statistic selected by `stat`, then create a single-band
    # GTiff dataset at `outfile`.
    #
    # `stat` values handled here: 'median', 'mean', 'std', 'valid_data',
    # 'snr', 'coeff_var' and 'pearson_corr'. The branches are independent
    # `if` statements and `new_array` is assigned only inside them.
    #
    # `previous_p3_mosaic_dir` is not referenced in the visible lines;
    # presumably it belongs to the truncated os.path.join call in the
    # 'pearson_corr' branch -- confirm against the full source.

    # Open the original file
    dataset = gdal.Open(infile, gdal.GA_ReadOnly)
    num_layers = dataset.RasterCount
    # get the projection information
    # NOTE(review): the argument list of get_geo_info is missing here.
    no_data_value, xsize, ysize, geo_trans, projection, data_type = get_geo_info(

    # get the numpy 3rd dimension array stack of the bands of image
    raster_stack = bands2layerstack(infile)

    # define the default output type format
    output_type = gdal.GDT_Float32

    # call built in numpy statistical functions, with a specified axis. if
    # axis=2 means it will calculate along the 'depth' axis, per pixel.
    # with the return being n by m, the shape of each band.
    # Calculate the median statistical
    if stat == 'median':
        new_array = np.nanmedian(raster_stack, axis=2)
        # median and mean are the only branches that switch to UInt16 output
        output_type = gdal.GDT_UInt16
    # Calculate the mean statistical
    if stat == 'mean':
        new_array = np.nanmean(raster_stack, axis=2)
        output_type = gdal.GDT_UInt16
    # Calculate the standard deviation
    if stat == 'std':
        new_array = np.nanstd(raster_stack, axis=2)
    # Calculate the valid data
    if stat == 'valid_data':
        # calculate the number of valid data used in statistics products in percentage (0-100%),
        # this count the valid data (no nans) across the layers (time axis)
        new_array = (num_layers -
                     np.isnan(raster_stack).sum(axis=2)) * 100 / num_layers
    # Calculate the signal-to-noise ratio
    if stat == 'snr':
        # this signal-to-noise ratio defined as the mean divided by the standard deviation.
        m = np.nanmean(raster_stack, axis=2)
        sd = np.nanstd(raster_stack, axis=2, ddof=0)
        # pixels with zero deviation are reported as 0; note that m / sd is
        # still evaluated for every pixel before np.where selects
        new_array = np.where(sd == 0, 0, m / sd)
    # Calculate the coefficient of variation
    if stat == 'coeff_var':
        # the ratio of the biased standard deviation to the mean
        new_array = variation(raster_stack, axis=2, nan_policy='omit')
    # Calculate the Pearson's correlation coefficient
    if stat == 'pearson_corr':
        # https://github.com/scipy/scipy/blob/v0.14.0/scipy/stats/stats.py#L2392
        # get array of the previous mean file
        # NOTE(review): the first argument of os.path.join is missing here.
        previous_dataset_file = os.path.join(
            os.path.basename(outfile).split('_pearson_corr.tif')[0] + '.tif')

        # get the numpy 3rd dimension array stack of the bands of image
        previous_raster_stack = bands2layerstack(previous_dataset_file)

        # raster_stack and previous_raster_stack should have same length in all axis
        if raster_stack.shape != previous_raster_stack.shape:
            z_rs = raster_stack.shape[2]
            z_prs = previous_raster_stack.shape[2]

            # the slice start is negative, so the trailing surplus layers of
            # the deeper stack are dropped until both depths match
            if z_rs > z_prs:
                raster_stack = np.delete(raster_stack, np.s_[z_prs - z_rs:], 2)
            if z_prs > z_rs:
                previous_raster_stack = np.delete(previous_raster_stack,
                                                  np.s_[z_rs - z_prs:], 2)

        # propagate the nan values across the pair values in the same position for the
        # two raster in both directions
        mask1 = np.isnan(raster_stack)
        mask2 = np.isnan(previous_raster_stack)
        combined_mask = mask1 | mask2
        raster_stack = np.where(combined_mask, np.nan, raster_stack)
        # NOTE(review): the last argument of this np.where call is missing.
        previous_raster_stack = np.where(combined_mask, np.nan,
        del mask1, mask2, combined_mask

        # deviations from the per-pixel mean; nan_to_num zeroes the masked
        # positions so they add nothing to the sums below
        mean_rs = np.nanmean(raster_stack, axis=2, keepdims=True)
        mean_prs = np.nanmean(previous_raster_stack, axis=2, keepdims=True)
        m_rs = np.nan_to_num(raster_stack - mean_rs)
        m_prs = np.nan_to_num(previous_raster_stack - mean_prs)
        r_num = np.add.reduce(m_rs * m_prs, axis=2)
        # ss is not defined in this block; presumably a sum-of-squares helper
        # imported elsewhere -- confirm
        r_den = np.sqrt(ss(m_rs, axis=2) * ss(m_prs, axis=2))
        r = r_num / r_den

        # return the r coefficient -1 to 1
        new_array = r

    #### create the output geo tif
    # Set up the GTiff driver
    driver = gdal.GetDriverByName('GTiff')

    new_dataset = driver.Create(outfile, xsize, ysize, 1, output_type,
                                ["COMPRESS=LZW", "PREDICTOR=2", "TILED=YES"])
    # the '1' is for band 1
    # Write the array
    # NOTE(review): the excerpt ends here; the statements that write
    # new_array into the dataset are not shown.
コード例 #36
def descstats(data, cols=None, axis=0):
    """
    Build a text report of descriptive statistics for one or more variables.

    Parameters
    ----------
    data : array_like
        The observations.  A 1-D input is treated as a single column when
        `cols` is None.
    cols : list, optional
        Column numbers or field names (for a recarray) of the variables.
        The value is only echoed as the variable name of the single-column
        report; selecting columns through it is not implemented.
    axis : {0, 1}
        Axis order of data.  Default is 0 for column-ordered data.
        The value is accepted but not used.

    Returns
    -------
    desc : str
        For a single column: summary statistics, percentiles and three
        tests of location against zero.  For several columns: one table
        row per column with count, mean, standard deviation and range.

    Raises
    ------
    ValueError
        If the data has no columns.

    Examples
    --------
    >>> report = descstats(np.array([[1., 2.], [3., 4.]]))
    """
    # Local import: the rows of the multi-column table end in os.linesep.
    import os

    x = np.array(data)  # or rather, the data we're interested in
    if cols is None:
        if not isinstance(x, np.recarray) and x.ndim == 1:
            x = x[:, None]

    if x.shape[1] == 1:
        # Compute the mode once; both the value and its count are reported.
        mode_result = stats.mode(x)
        # NOTE(review): stats.ss is absent from recent SciPy releases --
        # confirm the SciPy version this module targets.
        summary_values = {
            'name': cols,
            'sum': 'N/A',
            'nobs': len(x),
            'mode': mode_result[0][0],
            'nmode': mode_result[1][0],
            'mean': x.mean(),
            'median': np.median(x),
            'range': '(' + str(x.min()) + ', ' + str(x.max()) + ')',
            'variance': x.var(),
            'stddev': x.std(),
            'coeffvar': stats.variation(x),
            'skewness': stats.skew(x),
            'kurtosis': stats.kurtosis(x),
            'uss': stats.ss(x),
            'ss': stats.ss(x - x.mean()),
            'sobs': np.sum(x),
        }
        desc = '''
    Univariate Descriptive Statistics

    Var. Name   %(name)12s
    Obs.          %(nobs)22i  Range                  %(range)22s
    Sum of Wts.   %(sum)22s  Coeff. of Variation     %(coeffvar)22.4g
    Mode          %(mode)22.4g  Skewness                %(skewness)22.4g
    Repeats       %(nmode)22i  Kurtosis                %(kurtosis)22.4g
    Mean          %(mean)22.4g  Uncorrected SS          %(uss)22.4g
    Median        %(median)22.4g  Corrected SS            %(ss)22.4g
    Variance      %(variance)22.4g  Sum Observations        %(sobs)22.4g
    Std. Dev.     %(stddev)22.4g
    ''' % summary_values

        percentiles = (1, 5, 10, 25, 50, 75, 90, 95, 99)
        desc += '''

    1  %%          %12.4g
    5  %%          %12.4g
    10 %%          %12.4g
    25 %%          %12.4g

    50 %%          %12.4g

    75 %%          %12.4g
    90 %%          %12.4g
    95 %%          %12.4g
    99 %%          %12.4g
    ''' % tuple([stats.scoreatpercentile(x, per) for per in percentiles])

        # Three tests of H0: location == 0.  sign_test is not defined in
        # this block; presumably it lives elsewhere in the module.
        t, p_t = stats.ttest_1samp(x, 0)
        M, p_M = sign_test(x)
        S, p_S = stats.wilcoxon(np.squeeze(x))

        desc += '''

    Tests of Location (H0: Mu0=0)
    Test                Statistic       Two-tailed probability
    Student's t      |  t %7.5f   Pr > |t|   <%.4f
    Sign             |  M %8.2f   Pr >= |M|  <%.4f
    Signed Rank      |  S %8.2f   Pr >= |S|  <%.4f

    ''' % (t, p_t, M, p_M, S, p_S)
    # Should this be part of a 'descstats'
    # in any event these should be split up, so that they can be called
    # individually and only returned together if someone calls summary
    # or something of the sort

    elif x.shape[1] > 1:
        desc = '''
    Var. Name   |     Obs.        Mean    Std. Dev.           Range
'''
        # One row per column; the column index doubles as the variable name.
        row_format = ("%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g "
                      "%(range)20s")
        for var in range(x.shape[1]):
            column = x[:, var]
            desc += row_format % {
                'name': var,
                'obs': len(column),
                'mean': column.mean(),
                'stddev': column.std(),
                'range': '(' + str(column.min()) + ', '
                         + str(column.max()) + ')' + os.linesep,
            }
    else:
        raise ValueError("data not understood")

    return desc