def HybridNormalGPDPDF(xs, u, mu, sigma, shape, loc, scale):
    '''
    Params: 
        xs: unsorted list of data to fit the semi-parametric PDF to.
        u: threshold at which to switch from the central Gaussian PDF fit to GPD tail fitting.
        mu:  mean of the data.
        sigma: standard deviation of the data.
        shape: gpd least squares estimated shape parameter.
        loc: gpd least squares estimated location parameter.
        scale: gpd least squares estimated scale parameter.
    Returns:
        an array that would result from xs.apply(semiparametric_fittedfunction) or F_n(xs) where F_n is the PDF fit.
    '''
    out = list()
    l = (mu - abs(u - mu))
    h = (mu + abs(u - mu))
    #print('u = %.10f,l = %.10f,h = %.10f'%(u,l,h))
    for x in xs:
        if x < l:
            out.append(
                norm.cdf(l, mu, sigma) *
                genpareto.pdf(l - x, shape, loc=loc, scale=scale))
        elif x >= h:
            out.append((1 - norm.cdf(h, mu, sigma)) *
                       genpareto.pdf(x - h, shape, loc=loc, scale=scale))
        else:
            out.append(norm.pdf(x, mu, sigma))
    return out
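A minimal usage sketch for the hybrid fit above; the GPD parameters below are illustrative placeholders, not estimates, and the imports mirror what the snippet assumes.
import numpy as np
from scipy.stats.distributions import norm, genpareto

data = np.random.normal(loc=0.0, scale=1.0, size=1000)
mu, sigma = data.mean(), data.std(ddof=1)
u = mu + 2.0 * sigma                   # illustrative tail threshold
shape, loc, scale = 0.1, 0.0, 0.5      # placeholder GPD parameters
density = HybridNormalGPDPDF(data, u, mu, sigma, shape, loc, scale)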
    def loss_function(abg):
#            #SIMPLE: Penalize negative a's, we want a positive, b/c for a<0, the algorithm is different:
#            if min(abg)<.0 or max(abg) > 5.:
#                return 1e6#
            error = 0.
            for (phi_m, phi_idx) in zip(phis, range(N_phi)):
                Is = bins[phi_m]['Is']
                uniqueIs = bins[phi_m]['unique_Is']
                 
                a,b,g = abg[0], abg[1], abg[2]
                movingThreshold = getMovingThreshold(a,g, phi_m)
                
                LHS_numerator = movingThreshold(uniqueIs[1:]) *sqrt(2.)
                LHS_denominator = b * sqrt(1 - exp(-2*uniqueIs[1:]))
                LHS = 1 -  norm.cdf(LHS_numerator / LHS_denominator)
                
                RHS = zeros_like(LHS)
                N  = len(Is)
                for rhs_idx in range(1, len(uniqueIs)):
                    t = uniqueIs[rhs_idx]
                    lIs = Is[Is<t]
                    taus = t - lIs;
                    
                    numerator = (movingThreshold(t) - movingThreshold(lIs)* exp(-taus)) * sqrt(2.)
                    denominator = b *  sqrt(1. - exp(-2*taus))
                    RHS[rhs_idx-1] = sum(1. - norm.cdf(numerator/denominator)) / N
                
#                error += sum(abs(LHS - RHS));
                error += sum((LHS - RHS)**2);
#                error += max(abs(LHS - RHS))

            return error
        def fun(params):
            """
            Negative log-likelihood of z-scores.

            The function has three arguments, packed into a vector:

            mean : location parameter
            logscale : log of the scale parameter
            logitprop : logit of the proportion of true nulls

            The implementation follows section 4 from Efron 2008.
            """

            d, s, p = xform(params)

            # Mass within the central region
            central_mass = (norm.cdf((null_ub - d) / s) -
                            norm.cdf((null_lb - d) / s))

            # Probability that a Z-score is null and is in the central region
            cp = p * central_mass

            # Binomial term
            rval = n_zs0 * np.log(cp) + (n_zs - n_zs0) * np.log(1 - cp)

            # Truncated Gaussian term for null Z-scores
            zv = (zscores0 - d) / s
            rval += np.sum(-zv**2 / 2) - n_zs0 * np.log(s)
            rval -= n_zs0 * np.log(central_mass)

            return -rval
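The closure above depends on an external xform helper that unpacks the parameter vector. The sketch below is only one plausible definition consistent with the docstring (mean, log of scale, logit of the null proportion), not the library's actual implementation.
import numpy as np
from scipy.special import expit   # logistic sigmoid

def xform(params):
    # assumption: params = (mean, log(scale), logit(proportion of true nulls))
    mean, logscale, logitprop = params
    return mean, np.exp(logscale), expit(logitprop)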
Example #4
def plotlmmse():
    nsample = 10**5
    snrlst = range(0, 19)

    plt.subplot(211)
    pe = []
    for snr in snrlst:
        coeff = lmmse_coeff(tap1, 41, snr)
        delta_square = 10**(-snr / 10.) * sum(coeff**2)
        pe.append(1 - norm.cdf(sqrt((1 - 0.7)**2 / delta_square)))
    plt.semilogy(snrlst, pe, snrlst, equalizer(2, 41, snrlst, nsample,
                                               'lmmse'), "-.")
    plt.legend(("Theoretical curve", "Simulated curve"), loc='lower left')
    plt.title("Theoretical vs. Simulated performances for Channel 1")
    plt.xlabel("SNR (dB)")
    plt.ylabel("SER (dB)")
    plt.grid(True, which='both')

    plt.subplot(212)
    pe = []
    for snr in snrlst:
        coeff = lmmse_coeff(tap2, 41, snr)
        delta_square = 10**(-snr / 10.) * sum(coeff**2)
        pe.append(1 - norm.cdf(sqrt((1 - 0.41)**2 / delta_square)))
    plt.semilogy(snrlst, pe, snrlst, equalizer(2, 41, snrlst, nsample,
                                               'lmmse'), "-.")
    plt.legend(("Theoretical curve", "Simulated curve"), loc='lower left')
    plt.title("Theoretical vs. Simulated performances for Channel 2")
    plt.xlabel("SNR (dB)")
    plt.ylabel("SER (dB)")
    plt.grid(True, which='both')
    plt.show()
        def fun(params):
            """
            Negative log-likelihood of z-scores.

            The function has three arguments, packed into a vector:

            mean : location parameter
            logscale : log of the scale parameter
            logitprop : logit of the proportion of true nulls

            The implementation follows section 4 from Efron 2008.
            """

            d, s, p = xform(params)

            # Mass within the central region
            central_mass = (norm.cdf((null_ub - d) / s) - norm.cdf(
                (null_lb - d) / s))

            # Probability that a Z-score is null and is in the central region
            cp = p * central_mass

            # Binomial term
            rval = n_zs0 * np.log(cp) + (n_zs - n_zs0) * np.log(1 - cp)

            # Truncated Gaussian term for null Z-scores
            zv = (zscores0 - d) / s
            rval += np.sum(-zv**2 / 2) - n_zs0 * np.log(s)
            rval -= n_zs0 * np.log(central_mass)

            return -rval
def DemoivreLaplaceApprox(k,l,n,p): #return [np, binP], not accurate
	np=float(n)*p;
	base=sqrt(np*(1-p));
	lf=float(l);
	kf=float(k);

	return [np,norm.cdf((lf+0.5-np)/base)-norm.cdf((kf-0.5-np)/base)];
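A quick sanity check of the approximation against the exact binomial probability, using scipy.stats.binom.
from scipy.stats import binom

approx_mean, approx_prob = DemoivreLaplaceApprox(40, 60, 100, 0.5)
exact_prob = binom.cdf(60, 100, 0.5) - binom.cdf(39, 100, 0.5)
print(approx_prob, exact_prob)        # the two probabilities should be close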
Example #7
def BlackScholes(reTime, rf, S, K, sigma):
	d1=(log(S/K)+(rf+sigma**2/2)*reTime)/(sigma*sqrt(reTime))
	d2=d1-sigma*sqrt(reTime)
	call_BS = (S*norm.cdf(d1,0,1)-K*exp(-rf*reTime)*norm.cdf(d2,0,1))
	put_BS = K*exp(-rf*reTime)*norm.cdf(-d2,0,1)-S*norm.cdf(-d1,0,1)
	delta=norm.cdf(d1,0,1)
	gamma=norm.pdf(d1,0,1)/(S*sigma*sqrt(reTime))
	vega=S*norm.pdf(d1)*np.sqrt(reTime)
	theta=-.5*S*norm.pdf(d1)*sigma/np.sqrt(reTime)
	return {'call_BS':call_BS,'put_BS':put_BS,'delta':delta,'gamma':gamma,'vega':vega,'theta':theta}
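Illustrative call (assumes the snippet's log/exp/sqrt/np/norm imports): price and Greeks for a one-year at-the-money option with 20% volatility and a 5% rate.
greeks = BlackScholes(reTime=1.0, rf=0.05, S=100.0, K=100.0, sigma=0.2)
print(greeks['call_BS'], greeks['put_BS'], greeks['delta'])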
Example #8
def QQPlot(DataValues, Alpha_CI=0.95, DataLabel='Data', FigFile='QQPlot.png'):

    ### Based on: https://www.tjmahr.com/quantile-quantile-plots-from-scratch/
    ### Itself based on Fox book: Fox, J. (2015)
    ### Applied Regression Analysis and Generalized Linear Models.
    ### Sage Publications, Thousand Oaks, California.

    # Data analysis
    N = len(DataValues)
    X_Bar = np.mean(DataValues)
    S_X = np.std(DataValues,ddof=1)

    # Sort data to get the rank
    Data_Sorted = np.zeros(N)
    Data_Sorted += DataValues
    Data_Sorted.sort()

    # Compute quantiles
    EmpiricalQuantiles = np.arange(0.5, N + 0.5) / N
    TheoreticalQuantiles = norm.ppf(EmpiricalQuantiles, X_Bar, S_X)
    ZQuantiles = norm.ppf(EmpiricalQuantiles,0,1)

    # Compute data variance
    DataIQR = np.quantile(DataValues, 0.75) - np.quantile(DataValues, 0.25)
    NormalIQR = np.sum(np.abs(norm.ppf(np.array([0.25, 0.75]), 0, 1)))
    Variance = DataIQR / NormalIQR
    Z_Space = np.linspace(min(ZQuantiles), max(ZQuantiles), 100)
    Variance_Line = Z_Space * Variance + np.median(DataValues)

    # Compute alpha confidence interval (CI)
    Z_SE = np.sqrt(norm.cdf(Z_Space) * (1 - norm.cdf(Z_Space)) / N) / norm.pdf(Z_Space)
    Data_SE = Z_SE * Variance
    Z_CI_Quantile = norm.ppf(np.array([(1 - Alpha_CI) / 2]), 0, 1)

    # Create point in the data space
    Data_Space = np.linspace(min(TheoreticalQuantiles), max(TheoreticalQuantiles), 100)

    # QQPlot
    BorderSpace = max( 0.05*abs(Data_Sorted.min()), 0.05*abs(Data_Sorted.max()))
    Y_Min = Data_Sorted.min() - BorderSpace
    Y_Max = Data_Sorted.max() + BorderSpace
    Figure, Axes = plt.subplots(1, 1, figsize=(5.5, 4.5), dpi=100)
    Axes.plot(TheoreticalQuantiles, Data_Sorted, linestyle='none', marker='o', mew=0.5, fillstyle='none', color=(0, 0, 0), label=DataLabel)
    Axes.plot(Data_Space, Variance_Line, linestyle='--', color=(1, 0, 0), label='Variance :' + str(format(np.round(Variance, 2),'.2f')))
    Axes.plot(Data_Space, Variance_Line + Z_CI_Quantile * Data_SE, linestyle='--', color=(0, 0, 1), label=str(int(100*Alpha_CI)) + '% CI')
    Axes.plot(Data_Space, Variance_Line - Z_CI_Quantile * Data_SE, linestyle='--', color=(0, 0, 1))
    plt.xlabel('Theoretical quantiles (-)')
    plt.ylabel('Empirical quantiles (-)')
    plt.ylim([Y_Min, Y_Max])
    plt.legend(loc='upper center', ncol=3, bbox_to_anchor=(0.5, 1.15), prop={'size':10})
    plt.savefig(FigFile)
    plt.show()
    plt.close(Figure)

    return Variance
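A hedged usage sketch, assuming the snippet's own numpy / scipy.stats / matplotlib imports are in scope; the sample is simulated normal data.
import numpy as np

Values = np.random.normal(loc=10.0, scale=2.0, size=50)
Var = QQPlot(Values, Alpha_CI=0.95, DataLabel='Simulated', FigFile='QQPlot.png')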
    def testSolveIWithDispersionMatchesMorrisk0Disp(self):
        """The solveIWithDispersionDimensional method matches results from
        Morris et al 2015.
        """
        time = np.linspace(0, 7, int(1.4e5)) #1000 pts per second
        time_step = time[1] - time[0]
        num_time_pts = len(time)
        dE = 0
        freq = 0
        Ru = 0
        Cdl = 0
        Cdl1 = 0
        Cdl2 = 0
        Cdl3 = 0
        EStart = -0.2
        ERev = 0.5
        temp = 293
        nu = 0.1

        area = 1
        coverage = 1e-11
        k_0BinsUnscaled = np.linspace(-7, 7, 15)
        #Define wE in terms of bin widths
        leftBinEnds = np.empty(15)
        leftBinEnds[1:] = np.linspace(-6.5, 6.5, 14)
        leftBinEnds[0] = -np.inf
        rightBinEnds = np.empty(15)
        rightBinEnds[:-1] = np.linspace(-6.5, 6.5, 14)
        rightBinEnds[-1] = np.inf
        wK = norm.cdf(rightBinEnds, loc=0, scale=2) -\
        norm.cdf(leftBinEnds, loc=0, scale=2)

        expWidth = np.array([0.124, 0.128, 0.134, 0.141, 0.150, 0.161,
                             0.172, 0.185, 0.198, 0.211])

        self.assertEqual(np.sum(wK), 1)

        for m, ew in zip(range(1, 11), expWidth):
            k_0Vals = 0.1 * 2**(0.1*m*k_0BinsUnscaled)
            bins = [(0, k_0, w) for k_0, w in zip(k_0Vals, wK)]
            I, amt = st.solve_reaction_disp_dim_bins(time_step,
                        num_time_pts, dE, freq, Ru, Cdl, Cdl1, Cdl2, Cdl3,
                        EStart, ERev, temp, nu, area, coverage, bins)

            width = st.half_maximum_width(I, time, nu)
            err = abs(width - ew)
            #Rounding error + 2*step size + error in I
            #(estimated at 1*stepsize)
            self.assertLess(err, 7e-4)
def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset.
    group_2 : array_like
        Second dataset.
    axis : int, optional
        Axis along which the medians are estimated. If None, the arrays are
        flattened.  If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.

    """
    (med_1, med_2) = (ma.median(group_1, axis=axis), ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis), mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1 ** 2 + std_2 ** 2)
    return 1 - norm.cdf(W)
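This helper mirrors the public scipy.stats.mstats.compare_medians_ms; a minimal call looks like the following.
from scipy.stats import mstats

p = mstats.compare_medians_ms([1, 2, 3, 4, 5, 6, 7],
                              [8, 9, 10, 11, 12, 13, 14])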
Example #11
def main():
    gamma = np.arange(2, 6.01, 0.01)
    delta = np.arange(4, 12.01, 0.01)
    c = np.zeros((len(gamma), len(delta)))

    # Calculate the expectation for coverage area
    for i in range(len(gamma)):
        for j in range(len(delta)):
            b = 10 * gamma[i] * log10(e) / delta[j]
            c[i, j] = 0.5 + exp(1 / b**2) * norm.cdf(-2 / b)
    print(np.max(c), np.min(c))

    # Plotting
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    X, Y = np.meshgrid(delta, gamma)
    surf = ax.plot_surface(X,
                           Y,
                           c,
                           rstride=5,
                           cstride=5,
                           cmap=cm.jet,
                           linewidth=1,
                           antialiased=True)
    ax.set_zlim3d(np.min(c), np.max(c))
    ax.set_xlabel('delta')
    ax.set_ylabel('gamma')
    ax.set_zlabel('coverage')
    fig.colorbar(surf)
    plt.show()
def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset.
    group_2 : array_like
        Second dataset.
    axis : int, optional
        Axis along which the medians are estimated. If None, the arrays are
        flattened.  If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.

    """
    (med_1, med_2) = (ma.median(group_1, axis=axis), ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1 ** 2 + std_2 ** 2)
    return 1 - norm.cdf(W)
def generate_ordinal():

    ## Regression coefficients
    beta = np.zeros(5, dtype=np.float64)
    beta[2] = 1
    beta[4] = -1

    rz = 0.5

    OUT = open("gee_ordinal_1.csv", "w")

    for i in range(200):

        n = np.random.randint(3, 6)  # Cluster size

        x = np.random.normal(size=(n, 5))
        for j in range(5):
            x[:, j] += np.random.normal()
        pr = np.dot(x, beta)
        pr = np.array([1, 0, -0.5]) + pr[:, None]
        pr = 1 / (1 + np.exp(-pr))

        z = rz*np.random.normal() +\
            np.sqrt(1-rz**2)*np.random.normal(size=n)
        u = norm.cdf(z)

        y = (u[:, None] > pr).sum(1)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n")

    OUT.close()
Example #14
def weightedUtest(g1, w1, g2, w2):
    """ Determines the confidence level of the assertion:
    'The values of g2 are higher than those of g1'.  
    (adapted from the scipy.stats version)
    
    Twist: here the elements of each group have associated weights, 
    corresponding to how often they are present (i.e. two identical entries with 
    weight w are equivalent to one entry with weight 2w).
    Reference: "Studies in Continuous Black-box Optimization", Schaul, 2011 [appendix B].
    
    TODO: make more efficient for large sets. 
    """
    from scipy.stats.distributions import norm
    import numpy
    n1 = sum(w1)
    n2 = sum(w2)
    u1 = 0.
    for x1, wx1 in zip(g1, w1):
        for x2, wx2 in zip(g2, w2):
            if x1 == x2:
                u1 += 0.5 * wx1 * wx2
            elif x1 > x2:
                u1 += wx1 * wx2
    mu = n1 * n2 / 2.
    sigu = numpy.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.)
    z = (u1 - mu) / sigu
    conf = norm.cdf(z)
    return conf
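An illustrative call with small weighted groups; values and weights are chosen arbitrarily.
g1, w1 = [0.10, 0.20, 0.30], [1.0, 2.0, 1.0]
g2, w2 = [0.25, 0.40, 0.50], [1.0, 1.0, 2.0]
confidence = weightedUtest(g1, w1, g2, w2)   # confidence level for the assertion in the docstring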
Example #15
def Z_test(x1, x2, Alpha=0.95):
    ResultsTable = pd.DataFrame()

    # Compute standard deviation and number of observation
    S_x1 = x1.std(ddof=1)
    S_x2 = x2.std(ddof=1)
    N_x1 = len(x1)
    N_x2 = len(x2)

    # Test statistic and p value
    Z = (x1.mean() - x2.mean()) / np.sqrt(S_x1**2 / N_x1 + S_x2**2 / N_x2)
    p = 2 * (1 - norm.cdf(abs(Z)))

    # Rejection range
    MinValue = norm.ppf((1 - Alpha) / 2)
    MaxValue = norm.ppf(1 - (1 - Alpha) / 2)
    RejectionRange = np.array([[-np.inf, round(MinValue, 3)],
                               [round(MaxValue, 3), np.inf]])

    Results = {
        'Test statistic': round(Z, 3),
        'p value': round(p, 9),
        'Significance level (%)': Alpha * 100,
        'Rejection range': RejectionRange
    }

    ResultsTable = pd.concat([ResultsTable, pd.DataFrame([Results])], ignore_index=True)

    return ResultsTable
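Illustrative call on two simulated samples, passed as pandas Series so that .mean()/.std(ddof=1) behave as in the snippet.
import numpy as np
import pandas as pd

x1 = pd.Series(np.random.normal(0.0, 1.0, size=100))
x2 = pd.Series(np.random.normal(0.5, 1.0, size=100))
print(Z_test(x1, x2, Alpha=0.95))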
Example #16
def compare_medians_ms(group_1, group_2, axis=None):
    """Compares the medians from two independent groups along the given axis.

The comparison is performed using the McKean-Schrader estimate of the standard
error of the medians.

Parameters
----------
    group_1 : {sequence}
        First dataset.
    group_2 : {sequence}
        Second dataset.
    axis : {integer}
        Axis along which the medians are estimated. If None, the arrays are flattened.

Returns
-------
    A (p,) array of comparison values.

    """
    (med_1, med_2) = (ma.median(group_1,
                                axis=axis), ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)
Example #17
def skewtest(a,axis=-1):
    """Tests whether the skew is significantly different from a normal distribution.

    Axis can equal None (ravel array first), an integer (the axis over
    which to operate), or a sequence (operate over multiple axes).

    NOTE: This function is mostly copied from scipy.stats.stats, but
    corrects for a major bug: the pvalue returned by SciPy is not valid
    when the skewness is negative! The return values also are slightly
    different: the skew is actually returned while the z-score is not.

    Returns: skewness and 2-tail z-probability
    """
    a, axis = _chk_asarray(a, axis)
    if axis is None:
        a = ravel(a)
        axis = 0
    skewness = skew(a,axis)
    n = float(a.shape[axis])
    if n<8:
        print "skewtest only valid for n>=8 ... continuing anyway, n=",n
    y = skewness * sqrt(((n+1)*(n+3)) / (6.0*(n-2)) )
    beta2 = ( 3.0*(n*n+27*n-70)*(n+1)*(n+3) ) / ( (n-2.0)*(n+5)*(n+7)*(n+9) )
    W2 = -1 + sqrt(2*(beta2-1))
    delta = 1/sqrt(log(sqrt(W2)))
    alpha = sqrt(2.0/(W2-1))
    y = where(equal(y,0),1,y)
    Z = delta*log(y/alpha + sqrt((y/alpha)**2+1))
    
    # The two-tailed p-value is twice the prob that value of a std normal r.v.
    # turns out to be greater than the (absolute) value of Z
    pvalue = 2*( 1 - norm.cdf(abs(Z)) )
    assert pvalue >= 0.0 and pvalue <= 1.0
    return skewness, pvalue
def MannWhitneyUTest(x,y):

    Nx = len(x)
    Ny = len(y)

    XData = pd.DataFrame({'Values': x, 'Group': 'Control'}, index=range(len(x)))
    YData = pd.DataFrame({'Values': y, 'Group': 'Test'}, index=range(len(y)))

    Pool = pd.concat([XData, YData], ignore_index=True)
    Pool['Ranks'] = Pool['Values'].rank(method='average')

    R1 = Pool[Pool['Group']=='Control']['Ranks'].sum()
    U1 = R1 - (Nx * (Nx+1)) / 2
    U2 = Nx * Ny - U1

    U = max(U1, U2)

    UMean = Nx * Ny / 2
    UStd  = np.sqrt((Nx * Ny * (Nx + Ny + 1)) / 12)

    # Transform into the z space
    from scipy.stats.distributions import norm
    z = (U - UMean) / UStd
    p = 2 * (1 - norm.cdf(abs(z)))

    return U, p
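Illustrative call, assuming the snippet's own pandas import; scipy.stats.mannwhitneyu can serve as a cross-check (its tie and continuity handling differ slightly).
import numpy as np

x = list(np.random.normal(0.0, 1.0, size=30))
y = list(np.random.normal(0.8, 1.0, size=30))
U, p = MannWhitneyUTest(x, y)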
Example #19
def compare_medians_ms(group_1, group_2, axis=None):
    """Compares the medians from two independent groups along the given axis.

The comparison is performed using the McKean-Schrader estimate of the standard
error of the medians.

Parameters
----------
    group_1 : {sequence}
        First dataset.
    group_2 : {sequence}
        Second dataset.
    axis : {integer}
        Axis along which the medians are estimated. If None, the arrays are flattened.

Returns
-------
    A (p,) array of comparison values.

    """
    (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)
Example #20
def skewtest(a, axis=-1):
    """Tests whether the skew is significantly different from a normal distribution.

    Axis can equal None (ravel array first), an integer (the axis over
    which to operate), or a sequence (operate over multiple axes).

    NOTE: This function is mostly copied from scipy.stats.stats, but
    corrects for a major bug: the pvalue returned by SciPy is not valid
    when the skewness is negative! The return values also are slightly
    different: the skew is actually returned while the z-score is not.

    Returns: skewness and 2-tail z-probability
    """
    a, axis = _chk_asarray(a, axis)
    if axis is None:
        a = ravel(a)
        axis = 0
    skewness = skew(a, axis)
    n = float(a.shape[axis])
    if n < 8:
        print "skewtest only valid for n>=8 ... continuing anyway, n=", n
    y = skewness * sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2)))
    beta2 = (3.0 * (n * n + 27 * n - 70) * (n + 1) *
             (n + 3)) / ((n - 2.0) * (n + 5) * (n + 7) * (n + 9))
    W2 = -1 + sqrt(2 * (beta2 - 1))
    delta = 1 / sqrt(log(sqrt(W2)))
    alpha = sqrt(2.0 / (W2 - 1))
    y = where(equal(y, 0), 1, y)
    Z = delta * log(y / alpha + sqrt((y / alpha)**2 + 1))

    # The two-tailed p-value is twice the prob that value of a std normal r.v.
    # turns out to be greater than the (absolute) value of Z
    pvalue = 2 * (1 - norm.cdf(abs(Z)))
    assert pvalue >= 0.0 and pvalue <= 1.0
    return skewness, pvalue
Example #21
def copula(num_samples, rho_mat, mu_mat, methods):
    """Copula procedure to generate an OTU table with corrs close to rho_mat.
    Inputs:
     num_samples - int, number of samples. 
     rho_mat - 2d arr, symmetric positive definite matrix which specifies the 
     correlation or covariation between the otu's in the table. 
     mu_mat - 1d arr w/ len(num_otus), mean of otu for multivariate random call.
     methods - list of lists w/ len(num_otus), each list has a variable number 
     of elements. the first element in each list is the 
     scipy.stats.distributions function like lognorm or beta. this is the 
     function that we draw values from for the actual otu. the remaining entries
     are the parameters for that function in order that the function requires 
     them.
    """
    num_otus = len(mu_mat)
    # draw from multivariate normal distribution with specified parameters.
    # transpose so that it remains otuXsample matrix.
    Z = multivariate_normal(mean=mu_mat, cov=rho_mat, size=num_samples).T
    # using the inverse cdf of the normal distribution find where each sample
    # value for each otu falls in the normal cdf.
    U = norm.cdf(Z)
    # make the otu table using the methods and cdf values. ppf_args[0] is the
    # distribution function (eg. lognorm) whose ppf function we will use
    # to transform the cdf vals into the new distribution. ppf_args[1:] is the
    # params of the function like a, b, size, loc etc.
    otu_table = array([
        ppf_args[0].ppf(otu_cdf_vals, *ppf_args[1:])
        for ppf_args, otu_cdf_vals in zip(methods, U)
    ])
    return where(otu_table > 0, otu_table, 0)
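A minimal sketch of a call, assuming the snippet's own imports (numpy.random.multivariate_normal, norm, array, where) and lognormal marginals; all parameter values are illustrative.
import numpy as np
from scipy.stats.distributions import lognorm

rho_mat = np.array([[1.0, 0.7],
                    [0.7, 1.0]])
mu_mat = np.zeros(2)
methods = [[lognorm, 1.0], [lognorm, 0.5]]   # (distribution, shape parameter)
otu_table = copula(num_samples=50, rho_mat=rho_mat, mu_mat=mu_mat, methods=methods)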
Example #22
def param_table(beta, y_name, x_names, sigma=None):
    # basic frame
    frame = pd.DataFrame({
        'coeff': beta,
    }, index=x_names)
    frame = frame.rename_axis(y_name, axis=1)

    # handle sigma cases
    if sigma is None:
        return frame
    elif type(sigma) is tuple:
        sigr, sigc = sigma
        stderr = np.sqrt(np.hstack([maybe_diag(sigr), sigc]))
    else:
        stderr = np.sqrt(maybe_diag(sigma))

    # confidence interval
    low95 = beta - z95 * stderr
    high95 = beta + z95 * stderr

    # p-value
    zscore = beta / stderr
    pvalue = 2 * (1 - norm.cdf(np.abs(zscore)))

    # stderr stats
    frame = frame.assign(stderr=stderr,
                         low95=low95,
                         high95=high95,
                         pvalue=pvalue)

    return frame
def generate_ordinal():

    ## Regression coefficients
    beta = np.zeros(5, dtype=np.float64)
    beta[2] = 1
    beta[4] = -1

    rz = 0.5

    OUT = open("gee_ordinal_1.csv", "w")

    for i in range(200):

        n = np.random.randint(3, 6) # Cluster size

        x = np.random.normal(size=(n,5))
        for j in range(5):
            x[:,j] += np.random.normal()
        pr = np.dot(x, beta)
        pr = np.array([1,0,-0.5]) + pr[:,None]
        pr = 1 / (1 + np.exp(-pr))

        z = rz*np.random.normal() +\
            np.sqrt(1-rz**2)*np.random.normal(size=n)
        u = norm.cdf(z)

        y = (u[:,None] > pr).sum(1)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j,:]]) + "\n")

    OUT.close()
Example #24
def copula(num_samples, rho_mat, mu_mat, methods):
    """Copula procedure to generate an OTU table with corrs close to rho_mat.
    Inputs:
     num_samples - int, number of samples. 
     rho_mat - 2d arr, symmetric positive definite matrix which specifies the 
     correlation or covariation between the otu's in the table. 
     mu_mat - 1d arr w/ len(num_otus), mean of otu for multivariate random call.
     methods - list of lists w/ len(num_otus), each list has a variable number 
     of elements. the first element in each list is the 
     scipy.stats.distributions function like lognorm or beta. this is the 
     function that we draw values from for the actual otu. the remaining entries
     are the parameters for that function in order that the function requires 
     them.
    """
    num_otus = len(mu_mat)
    # draw from multivariate normal distribution with specified parameters.
    # transpose so that it remains otuXsample matrix.
    Z = multivariate_normal(mean=mu_mat, cov=rho_mat, size=num_samples).T
    # using the inverse cdf of the normal distribution find where each sample 
    # value for each otu falls in the normal cdf.
    U = norm.cdf(Z)
    # make the otu table using the methods and cdf values. ppf_args[0] is the 
    # distribution function (eg. lognorm) whose ppf function we will use
    # to transform the cdf vals into the new distribution. ppf_args[1:] is the 
    # params of the function like a, b, size, loc etc. 
    otu_table = array([ppf_args[0].ppf(otu_cdf_vals, *ppf_args[1:])
        for ppf_args, otu_cdf_vals in zip(methods, U)])
    return where(otu_table > 0, otu_table, 0)
Example #25
def weightedUtest(g1, w1, g2, w2):
    """ Determines the confidence level of the assertion:
    'The values of g2 are higher than those of g1'.  
    (adapted from the scipy.stats version)
    
    Twist: here the elements of each group have associated weights, 
    corresponding to how often they are present (i.e. two identical entries with 
    weight w are equivalent to one entry with weight 2w).
    Reference: "Studies in Continuous Black-box Optimization", Schaul, 2011 [appendix B].
    
    TODO: make more efficient for large sets. 
    """
    from scipy.stats.distributions import norm
    import numpy

    n1 = sum(w1)
    n2 = sum(w2)
    u1 = 0.
    for x1, wx1 in zip(g1, w1):
        for x2, wx2 in zip(g2, w2):
            if x1 == x2:
                u1 += 0.5 * wx1 * wx2
            elif x1 > x2:
                u1 += wx1 * wx2
    mu = n1 * n2 / 2.
    sigu = numpy.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.)
    z = (u1 - mu) / sigu
    conf = norm.cdf(z)
    return conf 
    def testsolveIWithDispersionMatchesMorrisE0Disp(self):
        """The solveIWithDispersionDimensional method reproduces the results
        of Morris et al 2015.
        """
        time = np.linspace(0, 7, int(7e4)) #1000 pts per second
        time_step = time[1] - time[0]
        num_time_pts = len(time)
        dE = 0
        freq = 0
        Ru = 0
        Cdl = 0.
        Cdl1 = 0.
        Cdl2 = 0.
        Cdl3 = 0.
        EStart = -0.2
        ERev = 0.5
        temp = 293
        nu = 0.1
        area = 1
        coverage = 1e-11
        E_0BinsUnscaled = np.linspace(-17.5e-3, 17.5e-3, 15)
        #Define wE in terms of bin widths
        leftBinEnds = np.empty(15)
        leftBinEnds[1:] = np.linspace(-16.25e-3, 16.25e-3, 14)
        leftBinEnds[0] = -np.inf
        rightBinEnds = np.empty(15)
        rightBinEnds[:-1] = np.linspace(-16.25e-3, 16.25e-3, 14)
        rightBinEnds[-1] = np.inf
        wE = norm.cdf(rightBinEnds, loc=0, scale=5e-3) -\
        norm.cdf(leftBinEnds, loc=0, scale=5e-3)

        k_0Bins = {0.1 : 1.0}
        expWidth = np.array([0.124, 0.126, 0.129, 0.133, 0.138, 0.144, 0.151, 0.159, 0.167, 0.176])

        self.assertEqual(np.sum(wE), 1)

        for l,ew in zip(range(1, 11), expWidth):
            E_0Vals = l * E_0BinsUnscaled
            self.assertTrue(np.isclose(E_0Vals[-1]-E_0Vals[0], l*35.e-3))
            bins = [(E_0, 0.1, we) for E_0, we in zip(E_0Vals, wE)]

            I, amt = st.solve_reaction_disp_dim_bins(time_step, num_time_pts, dE, freq, Ru,
                        Cdl, Cdl1, Cdl2, Cdl3, EStart, ERev, temp, nu, area,
                        coverage, bins)

            width = st.half_maximum_width(I, time, nu)
            self.assertLess(abs(width - ew), 7e-4) #Rounding error + 2*step size + solution error (estimated at 1*step size)
Example #27
    def test_scoretest(self):
        # Regression tests

        np.random.seed(6432)
        n = 200 # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog += 3*np.random.normal(size=n)
        group = np.kron(np.arange(n/4), np.ones(4))

        # Test under the null.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([0.,])
        family = Gaussian()
        va = Independence()
        mod1 = GEE(endog, exog, group, family=family,
                  cov_struct=va, constraint=(L, R))
        rslt1 = mod1.fit()
        assert_almost_equal(mod1.score_test_results["statistic"],
                            1.08126334)
        assert_almost_equal(mod1.score_test_results["p-value"],
                            0.2984151086)

        # Test under the alternative.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([1.0,])
        family = Gaussian()
        va = Independence()
        mod2 = GEE(endog, exog, group, family=family,
                   cov_struct=va, constraint=(L, R))
        rslt2 = mod2.fit()
        assert_almost_equal(mod2.score_test_results["statistic"],
                            3.491110965)
        assert_almost_equal(mod2.score_test_results["p-value"],
                            0.0616991659)

        # Compare to Wald tests
        exog = np.random.normal(size=(n, 2))
        L = np.array([[1, -1]])
        R = np.array([0.])
        f = np.r_[1, -1]
        for i in range(10):
            endog = exog[:, 0] + (0.5 + i/10.)*exog[:, 1] +\
                    np.random.normal(size=n)
            family = Gaussian()
            va = Independence()
            mod0 = GEE(endog, exog, group, family=family,
                       cov_struct=va)
            rslt0 = mod0.fit()
            family = Gaussian()
            va = Independence()
            mod1 = GEE(endog, exog, group, family=family,
                       cov_struct=va, constraint=(L, R))
            rslt1 = mod1.fit()
            se = np.sqrt(np.dot(f, np.dot(rslt0.cov_params(), f)))
            wald_z = np.dot(f, rslt0.params) / se
            wald_p = 2*norm.cdf(-np.abs(wald_z))
            score_p = mod1.score_test_results["p-value"]
            assert_array_less(np.abs(wald_p - score_p), 0.02)
Example #28
    def test_scoretest(self):
        # Regression tests

        np.random.seed(6432)
        n = 200 # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog += 3*np.random.normal(size=n)
        group = np.kron(np.arange(n/4), np.ones(4))

        # Test under the null.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([0.,])
        family = Gaussian()
        va = Independence()
        mod1 = GEE(endog, exog, group, family=family,
                  cov_struct=va, constraint=(L, R))
        rslt1 = mod1.fit()
        assert_almost_equal(mod1.score_test_results["statistic"],
                            1.08126334)
        assert_almost_equal(mod1.score_test_results["p-value"],
                            0.2984151086)

        # Test under the alternative.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([1.0,])
        family = Gaussian()
        va = Independence()
        mod2 = GEE(endog, exog, group, family=family,
                   cov_struct=va, constraint=(L, R))
        rslt2 = mod2.fit()
        assert_almost_equal(mod2.score_test_results["statistic"],
                            3.491110965)
        assert_almost_equal(mod2.score_test_results["p-value"],
                            0.0616991659)

        # Compare to Wald tests
        exog = np.random.normal(size=(n, 2))
        L = np.array([[1, -1]])
        R = np.array([0.])
        f = np.r_[1, -1]
        for i in range(10):
            endog = exog[:, 0] + (0.5 + i/10.)*exog[:, 1] +\
                    np.random.normal(size=n)
            family = Gaussian()
            va = Independence()
            mod0 = GEE(endog, exog, group, family=family,
                       cov_struct=va)
            rslt0 = mod0.fit()
            family = Gaussian()
            va = Independence()
            mod1 = GEE(endog, exog, group, family=family,
                       cov_struct=va, constraint=(L, R))
            rslt1 = mod1.fit()
            se = np.sqrt(np.dot(f, np.dot(rslt0.cov_params(), f)))
            wald_z = np.dot(f, rslt0.params) / se
            wald_p = 2*norm.cdf(-np.abs(wald_z))
            score_p = mod1.score_test_results["p-value"]
            assert_array_less(np.abs(wald_p - score_p), 0.02)
def calculateProbability(time, avg, sd):
    if sd == pd.to_timedelta(0):
        print(
            "Standard deviation is 0, not enough data points, returning p = 0")
        p = 0
    else:
        z = (time - avg) / sd
        p = norm.cdf(z)
    return p
Example #30
def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset.  Has to be of size >=7.
    group_2 : array_like
        Second dataset.  Has to be of size >=7.
    axis : int, optional
        Axis along which the medians are estimated. If None, the arrays are
        flattened.  If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.

    Examples
    --------

    >>> from scipy import stats
    >>> a = [1, 2, 3, 4, 5, 6, 7]
    >>> b = [8, 9, 10, 11, 12, 13, 14]
    >>> stats.mstats.compare_medians_ms(a, b, axis=None)
    1.0693225866553746e-05

    The function is vectorized to compute along a given axis.

    >>> import numpy as np
    >>> rng = np.random.default_rng()
    >>> x = rng.random(size=(3, 7))
    >>> y = rng.random(size=(3, 8))
    >>> stats.mstats.compare_medians_ms(x, y, axis=1)
    array([0.36908985, 0.36092538, 0.2765313 ])

    References
    ----------
    .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
       for studentizing the sample median." Communications in
       Statistics-Simulation and Computation 13.6 (1984): 751-773.

    """
    (med_1, med_2) = (ma.median(group_1,
                                axis=axis), ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)
Example #31
def get_candidate_window2(read, x, y, repx, repy, threshold):
    # using PHI = 1e6 to prescreen the genome 
    PHI = 1e6
    GAMMA_HAT = 1.0
    tau = numpy.sqrt( y*( repx*x*(PHI+y) + repy*y*(PHI+x))/repx/repy/PHI/x**3)
    gamma = y/x
    z = (numpy.log(gamma)-numpy.log(GAMMA_HAT))*gamma/tau
    pvalue = norm.cdf(-z)
    pre_idx_list = numpy.where(pvalue[10:-10]<threshold)[0]+10
    return numpy.array(pre_idx_list)
Example #32
def logrank_power(n, surv1, surv2, alpha=0.05):
    d = n * (2 - surv1 - surv2)
    if surv1 == 1 or surv2 == 1:
        return 0
    elif surv1 == 0 or surv2 == 0:
        return -1
    phi = log(surv1) / log(surv2) if surv1 < surv2 else log(surv2) / log(surv1)
    z_a = norm.ppf(1 - alpha)
    z_1_beta = sqrt(d * (1 - phi) * (1 - phi) / (1 + phi) / (1 + phi)) - z_a
    return norm.cdf(z_1_beta)
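Illustrative call (assumes the snippet's log, sqrt and norm imports): power of a log-rank test with 100 subjects per comparison and survival proportions of 0.60 vs. 0.75.
power = logrank_power(100, 0.60, 0.75, alpha=0.05)
print(round(power, 3))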
    def loss_function_simple(abg, visualize, fig_tag = ''):
        error = .0;
        if visualize:
            figure()
                        
        for (phi_m, phi_idx) in zip(phis, range(N_phi)):
            Is = bins[phi_m]['Is']
            uniqueIs = bins[phi_m]['unique_Is']
             
            a,b,g = abg[0], abg[1], abg[2]
            movingThreshold = getMovingThreshold(a,g, phi_m,binnedTrain.theta)
            
            LHS_numerator = movingThreshold(uniqueIs[1:]) *sqrt(2.)
            LHS_denominator = b * sqrt(1 - exp(-2*uniqueIs[1:]))
            LHS = 1 -  norm.cdf(LHS_numerator / LHS_denominator)
            
            RHS = zeros_like(LHS)
            N  = len(Is)
            for rhs_idx in range(1, len(uniqueIs)):
                t = uniqueIs[rhs_idx]
                lIs = Is[Is<t]
                taus = t - lIs;
                
                numerator = (movingThreshold(t) - movingThreshold(lIs)* exp(-taus)) * sqrt(2.)
                denominator = b *  sqrt(1. - exp(-2*taus))
                RHS[rhs_idx-1] = sum(1. - norm.cdf(numerator/denominator)) / N
            
            weight = len(Is)
            lerror = dot((LHS - RHS)**2 , diff(uniqueIs)) * weight;
            error += lerror
        
            if visualize:
                    subplot(int(ceil(len(phis)/2)), 2, phi_idx+1)
                    ts = uniqueIs[1:]; 
                    plot(ts, LHS, 'b');
                    plot(ts, RHS, 'rx');
#                    annotate('$\phi$ = %.2g'%(phi_m), ((min(ts), max(LHS)/2.)), ) 
                    annotate('lerror = %.3g'%lerror,((min(ts), max(LHS)/2.)), ) 
        if visualize:
            subplot(int(ceil(len(phis)/2)), 2, 1)
            title(fig_tag)          
        return error
 def RHS(ts):
     if not iterable(ts):
         ts = [ts]
     rhs = empty_like(ts)
     for t, t_indx in zip(ts, range(size(ts))):
         lIs = Is[Is<t];
         taus = t - lIs;
         numerator = (movingThreshold(t) - movingThreshold(lIs)* exp(-taus)) * sqrt(2.)
         denominator = b *  sqrt(1. - exp(-2*taus))
         rhs[t_indx] = sum(1. - norm.cdf(numerator/denominator)) / N
     return rhs
Example #35
def get_candidate_window2(x, y, repx, repy, threshold):
    # using PHI = 1e6 to prescreen the genome
    PHI = 1e6
    GAMMA_HAT = 1.0
    tau = numpy.sqrt(y * (repx * x * (PHI + y) + repy * y * (PHI + x)) / repx /
                     repy / PHI / x**3)
    gamma = y / x
    z = (numpy.log(gamma) - numpy.log(GAMMA_HAT)) * gamma / tau
    pvalue = norm.cdf(-z)
    pre_idx_list = numpy.where(pvalue[10:-10] < threshold)[0] + 10
    return numpy.array(pre_idx_list)
Example #36
def per_chr_nbtest(read_array, chr, swap, threshold, peaktype, difftest,
                   start1, end1, start2, end2, test_rep, control_rep):
    t1 = time.time()
    sig_peaks_list = []
    y_bar_array = numpy.mean(read_array[:, start1:end1], 1)
    x_bar_array = numpy.mean(read_array[:, start2:end2], 1)
    if swap:  #swap the chip and control reads.
        x_bar_array, y_bar_array = y_bar_array, x_bar_array
    cand_index = get_candidate_window2(x_bar_array, y_bar_array, control_rep,
                                       test_rep, threshold)
    debug("There are %d candidate windows for %s (PID:%d)", len(cand_index),
          chr, os.getpid())
    if not swap:
        disp_list = numpy.array([
            estimate_area_dispersion_factor(read_array, test_rep, control_rep,
                                            idx, peaktype, difftest)
            for idx in cand_index
        ])
    else:
        disp_list = numpy.array([
            estimate_area_dispersion_factor(read_array, control_rep, test_rep,
                                            idx, peaktype, difftest)
            for idx in cand_index
        ])
    #debug("finished estimating dispersion for %s", chr)

    # return []
    cand_x_bar_array = x_bar_array[cand_index]
    cand_y_bar_array = y_bar_array[cand_index]
    gamma_array = cand_y_bar_array / cand_x_bar_array
    tau_hat_array = numpy.sqrt(
        cand_y_bar_array *
        ((control_rep * cand_x_bar_array * (disp_list + cand_y_bar_array)) +
         (test_rep * cand_y_bar_array * (disp_list + cand_x_bar_array))) /
        (test_rep * control_rep * disp_list * (cand_x_bar_array**3)))

    gamma_hat = 1.0  #Null hypothesis
    z_score_array = ((numpy.log(gamma_array) - numpy.log(gamma_hat)) *
                     gamma_array / tau_hat_array)
    pval_array = norm.cdf(-z_score_array)
    test_index = numpy.where(pval_array < threshold)
    test_index = test_index[0]
    sig_index = cand_index[test_index]
    sig_pval = pval_array[test_index]
    sig_group1_count = cand_y_bar_array[test_index]
    sig_group2_count = cand_x_bar_array[test_index]
    #sig_disp = disp_list[test_index]
    for i, a in enumerate(test_index):
        sig_peaks_list.append(
            Peak(chr, sig_index[i], sig_group1_count[i], sig_group2_count[i],
                 sig_pval[i], 0))
    t2 = time.time()
    debug("Analysis finished for %s, used %f sec CPU time", chr, t2 - t1)
    return sig_peaks_list
Example #37
def tst_importance_sampl():
    """
	Reducing variance using importance sampling.
	"""
    print("Probability that std normal will be greater than 2 is:" +
          str((1 - norm.cdf(2, 0, 1))))
    print("What we get from direct simulation:" +
          str(sum(np.random.normal(0, 1, size=10000) > 2) / 10000))
    summ = 0
    for x in np.random.normal(2, 1, size=10000):
        summ += (x > 2) * norm.pdf(x, 0, 1) / norm.pdf(x, 2, 1)
    print("With importance sampling:" + str(summ / 10000))
 def RHS(ts):
     if not iterable(ts):
         ts =  array([ts])
     lIs = tile(Is,  len(ts) ).reshape((len(ts), len(Is))).transpose()
     lts = tile(ts, (len(Is),1 ) )
     mask = lIs < lts
     taus = (lts - lIs); #*mask
     #NOTE BELOW WE use abs(taus) since for non-positive taus we will mask away anyway:
     numerator = (movingThreshold(lts) - movingThreshold(lIs)* exp(-abs(taus))) * sqrt(2.)
     denominator = b *  sqrt(1. - exp(-2*abs(taus)))
     
     rhs = sum( (1. - norm.cdf(numerator/denominator))*mask, axis=0) / N_Is
     return rhs
Example #39
def filterExptsByPseudoCountDistr( ddict ):
    
    # remove experiments where the pseudocount is high
    # relative to the other pseudocounts
    pseudodict     = { k : ddict[k]['PSEUDO'] for k in ddict }
    pskeys         = list(pseudodict.keys())
    pslogvals      = np.log10(list(pseudodict.values()))
    pslogmad       = mad(pslogvals) ; 
    pslogmedian    = np.percentile(pslogvals,50)
    pslvps_hi      = 1-norm.cdf((pslogvals-pslogmedian)/pslogmad)
    rejected_ds_hi = multipletests( pslvps_hi, alpha=0.05 )[0]

    # return data in a dictionary
    filteredExpts  = {  pskeys[i] : rejected_ds_hi[i] for i in range(len(pskeys))}
    return filteredExpts
def dosim(hyp, cov_struct=None, mcrep=500):

    # Storage for the simulation results
    scales = [[], []]

    # P-values from the score test
    pv = []

    # Monte Carlo loop
    for k in range(mcrep):

        # Generate random "probability points" u  that are uniformly
        # distributed, and correlated within clusters
        z = np.random.normal(size=n)
        u = np.random.normal(size=n // m)
        u = np.kron(u, np.ones(m))
        z = r * z + np.sqrt(1 - r**2) * u
        u = norm.cdf(z)

        # Generate the observed responses
        y = negbinom(u, mu=mu[hyp], scale=scale)

        # Fit the null model
        m0 = sm.GEE(y,
                    x0,
                    groups=grp,
                    cov_struct=cov_struct,
                    family=sm.families.Poisson())
        r0 = m0.fit(scale='X2')
        scales[0].append(r0.scale)

        # Fit the alternative model
        m1 = sm.GEE(y,
                    x,
                    groups=grp,
                    cov_struct=cov_struct,
                    family=sm.families.Poisson())
        r1 = m1.fit(scale='X2')
        scales[1].append(r1.scale)

        # Carry out the score test
        st = m1.compare_score_test(r0)
        pv.append(st["p-value"])

    pv = np.asarray(pv)
    rslt = [np.mean(pv), np.mean(pv < 0.1)]

    return rslt, scales
def estimate_params_for_normal(x, low_bound , mu_initial, sigma_initial):
	"""
		Takes a vector x of truncated data with a known lower
		truncation bound and estimates the parameters of the 
		fit of an untruncated normal distribution.
		code from Chris Fonnesbeck's Python data analysis tutorial on Sense
		https://sense.io/prometheus2305/data-analysis-in-python/files/Statistical%20Data%20Modeling.py
	"""


	# normalize vector
	mu_initial = float(mu_initial)
	sigma_initial = float(sigma_initial)
	#x = np.random.normal(size=10000,loc=2000,scale= 2000)

	x = (np.asarray(x, dtype=float) - mu_initial) / sigma_initial
	a =  (low_bound - mu_initial)/sigma_initial # normalize lower bound
	

	#_ = plt.hist(x, bins=100)
	#plt.show()
	#plt.close()

	# We can construct a log likelihood for this function using the conditional
	# form	
	trunc_norm = lambda theta, a, x: -(np.log(norm.pdf(x, theta[0], theta[1])) - 
	                                      np.log(1 - norm.cdf(a, theta[0], theta[1]))).sum()

	# For this example, we will use another optimization algorithm, the
	# **Nelder-Mead simplex algorithm**. It has a couple of advantages: 
	# 
	# - it does not require derivatives
	# - it can optimize (minimize) a vector of parameters
	# 
	# SciPy implements this algorithm in its `fmin` function:

	# we have normalized data, given that the lower truncation point a
	# is pretty far out in the tail - the standard normal parameters are
	# a first good guess, i.e. 0,1
	initial_guess = np.array([0,1]) 
	sol = fmin(trunc_norm, initial_guess , args=(a, x))
	print(sol)
	mean_normalized,stddev_normalized = sol[0],sol[1]
	mean_est =( 1 + mean_normalized ) * mu_initial
	stddev_est = stddev_normalized * sigma_initial
	print(mean_est, stddev_est)
	return mean_est,stddev_est
Example #42
def estimate_params_for_normal(x, low_bound, mu_initial, sigma_initial):
    """
		Takes a vector x of truncated data with a known lower
		truncation bound and estimates the parameters of the 
		fit of an untruncated normal distribution.
		code from Chris Fonnesbeck's Python data analysis tutorial on Sense
		https://sense.io/prometheus2305/data-analysis-in-python/files/Statistical%20Data%20Modeling.py
	"""

    # normalize vector
    mu_initial = float(mu_initial)
    sigma_initial = float(sigma_initial)
    #x = np.random.normal(size=10000,loc=2000,scale= 2000)

    x = (np.asarray(x, dtype=float) - mu_initial) / sigma_initial
    a = (low_bound - mu_initial) / sigma_initial  # normalize lower bound

    #_ = plt.hist(x, bins=100)
    #plt.show()
    #plt.close()

    # We can construct a log likelihood for this function using the conditional
    # form
    trunc_norm = lambda theta, a, x: -(np.log(norm.pdf(x, theta[0], theta[
        1])) - np.log(1 - norm.cdf(a, theta[0], theta[1]))).sum()

    # For this example, we will use another optimization algorithm, the
    # **Nelder-Mead simplex algorithm**. It has a couple of advantages:
    #
    # - it does not require derivatives
    # - it can optimize (minimize) a vector of parameters
    #
    # SciPy implements this algorithm in its `fmin` function:

    # we have normalized data, given that the lower truncation point a
    # is pretty far out in the tail - the standard normal parameters are
    # a first good guess, i.e. 0,1
    initial_guess = np.array([0, 1])
    sol = fmin(trunc_norm, initial_guess, args=(a, x))
    print(sol)
    mean_normalized, stddev_normalized = sol[0], sol[1]
    mean_est = (1 + mean_normalized) * mu_initial
    stddev_est = stddev_normalized * sigma_initial
    print(mean_est, stddev_est)
    return mean_est, stddev_est
Example #43
def per_chr_nbtest(read_array, chr, swap,threshold, peaktype,difftest, start1,end1,start2,end2,test_rep,control_rep):
    t1 = time.time()
    sig_peaks_list = []
    y_bar_array = numpy.mean(read_array[:, start1:end1], 1)
    x_bar_array = numpy.mean(read_array[:, start2:end2], 1)
    if swap: #swap the chip and control reads.
        x_bar_array, y_bar_array = y_bar_array, x_bar_array
    cand_index = get_candidate_window2( x_bar_array,
                    y_bar_array, control_rep, test_rep, threshold)
    debug("There are %d candidate windows for %s (PID:%d)", len(cand_index), chr, os.getpid())
    if not swap:
        disp_list = numpy.array([estimate_area_dispersion_factor(read_array,
                test_rep, control_rep, idx, peaktype, difftest)
                for idx in cand_index])
    else:
        disp_list = numpy.array([estimate_area_dispersion_factor(read_array,
                control_rep, test_rep, idx, peaktype, difftest)
                for idx in cand_index])
    #debug("finished estimating dispersion for %s", chr)
   # return []
    cand_x_bar_array = x_bar_array[cand_index]
    cand_y_bar_array = y_bar_array[cand_index]
    gamma_array = cand_y_bar_array / cand_x_bar_array
    tau_hat_array = numpy.sqrt(cand_y_bar_array*
            ((control_rep*cand_x_bar_array*(disp_list+cand_y_bar_array)) +
            (test_rep*cand_y_bar_array*(disp_list+cand_x_bar_array)))/
            (test_rep*control_rep*disp_list*(cand_x_bar_array**3)))

    gamma_hat = 1.0 #Null hypothesis
    z_score_array = ((numpy.log(gamma_array)-numpy.log(gamma_hat))*
            gamma_array/tau_hat_array)
    pval_array = norm.cdf(-z_score_array)
    test_index = numpy.where(pval_array<threshold)
    test_index = test_index[0]
    sig_index = cand_index[test_index]
    sig_pval = pval_array[test_index]
    sig_group1_count = cand_y_bar_array[test_index]
    sig_group2_count = cand_x_bar_array[test_index]
    #sig_disp = disp_list[test_index]
    for i, a in enumerate(test_index):
        sig_peaks_list.append(Peak(chr, sig_index[i], sig_group1_count[i], sig_group2_count[i], sig_pval[i], 0))
    t2 = time.time()
    debug ("Analysis finished for %s, used %f sec CPU time", chr, t2-t1)
    return sig_peaks_list
Example #44
def compare_medians_ms(group_1, group_2, axis=None):
    """Compares the medians from two independent groups along the given axis.
    Returns an array of p values.
    The comparison is performed using the McKean-Schrader estimate of the standard
    error of the medians.    
    
:Inputs:
    group_1 : sequence
        First dataset.
    group_2 : sequence
        Second dataset.
    axis : integer *[None]*
        Axis along which the medians are estimated. If None, the arrays are flattened.
    """
    (med_1, med_2) = (mmedian(group_1, axis=axis), mmedian(group_2, axis=axis))
    (std_1, std_2) = (stde_median(group_1, axis=axis), 
                      stde_median(group_2, axis=axis)) 
    W = abs(med_1 - med_2) / sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)
Example #45
def param_table(beta, sigma, names):
    # standard errors
    stderr = np.sqrt(sigma.diagonal())

    # confidence interval
    low95 = beta - z95*stderr
    high95 = beta + z95*stderr

    # p-value
    zscore = beta/stderr
    pvalue = 2*(1-norm.cdf(np.abs(zscore)))

    # return all
    return pd.DataFrame({
        'coeff': beta,
        'stderr': stderr,
        'low95': low95,
        'high95': high95,
        'pvalue': pvalue
    }, index=names)
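Illustrative call; z95 is defined elsewhere in the source module, presumably as the two-sided 95% normal critical value, which is assumed here.
import numpy as np
import pandas as pd
from scipy.stats.distributions import norm

z95 = norm.ppf(0.975)                 # assumption: two-sided 95% critical value
beta = np.array([1.2, -0.4])
sigma = np.array([[0.04, 0.00],
                  [0.00, 0.09]])
table = param_table(beta, sigma, names=['x1', 'x2'])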
Example #46
def plotzfe():
    """ Test Simulation 1-2
        Plot theoretical vs. simulated results for zero forcing equalizer."""
    nsample = 10**5
    snrlst = range(0,19)

    # Calculate the theoretical values
    pe = []
    coeff = zero_forcing_coeff(tap2, 41)
    for snr in snrlst:
        delta_square = 10**(-snr/10.)*sum(coeff**2)
        pe.append(1-norm.cdf(sqrt((1-0.41)**2/delta_square)))
    
    plt.semilogy(snrlst,pe,snrlst,equalizer(2,41,snrlst,nsample,'zfir'),"-.")
    plt.legend(("Theoretical curve", "41 taps simulation"), loc='lower left')
    plt.title("Theoretical vs. Simulated performances")
    plt.xlabel("SNR (dB)")
    plt.ylabel("SER (dB)")
    plt.grid(True, which='both')
    plt.show()
Example #47
def plotdfe2():
    nsample = 10**5
    snrlst = range(0,19)
    nzf = 41
    tap = tap2
    
    pe = []
    for snr in snrlst:
        cj,_ = dfe_coeff(tap, nzf, 41, snr)
        f = [0]*(len(cj)-len(tap)) + list(tap)[::-1]
        jmin = 1-sum(np.array(f)*cj)
        gamma = (1-jmin)/jmin
        pe.append(1-norm.cdf(sqrt(gamma)))
    plt.semilogy(snrlst,pe,snrlst, equalizer(2, 41, snrlst, nsample, 'dfe'), "-.")
    plt.legend(("Theoretical curve", "Simulated curve"), loc='lower left')
    plt.title("Theoretical vs. Simulated performances for Channel 1")
    plt.xlabel("SNR (dB)")
    plt.ylabel("SER (dB)")
    plt.grid(True, which='both')
    plt.show()
Example #48
def kurtosistest(a, axis=-1):
    """Tests whether a dataset has normal kurtosis

    That is, test whether kurtosis=3(n-1)/(n+1). Valid only for n>20.  Axis
    can equal None (ravel array first), an integer (the axis over which to
    operate), or a sequence (operate over multiple axes).

    NOTE: This function is mostly copied from scipy.stats.stats, but
    corrects for a major bug: the pvalue returned by SciPy is not valid
    when the kurtosis is negative! The return values also are slightly
    different: the kurtosis is actually returned while the z-score is not.

    Returns: kurtosis and 2-tail z-probability.
    """
    a, axis = _chk_asarray(a, axis)
    n = float(a.shape[axis])
    if n < 20:
        print "kurtosistest only valid for n>=20 ... continuing anyway, n=", n
    kurt = kurtosis(a, axis)
    E = 3.0 * (n - 1) / (n + 1)
    varkurt = 24.0 * n * (n - 2) * (n - 3) / ((n + 1) * (n + 1) * (n + 3) *
                                              (n + 5))
    x = (kurt - E) / sqrt(varkurt)
    sqrtbeta1 = 6.0 * (n * n - 5 * n + 2) / ((n + 7) * (n + 9)) * sqrt(
        (6.0 * (n + 3) * (n + 5)) / (n * (n - 2) * (n - 3)))
    A = 6.0 + 8.0 / sqrtbeta1 * (2.0 / sqrtbeta1 + sqrt(1 + 4.0 /
                                                        (sqrtbeta1**2)))
    term1 = 1 - 2 / (9.0 * A)
    denom = 1 + x * sqrt(2 / (A - 4.0))
    denom = where(less(denom, 0), 99, denom)
    term2 = where(equal(denom, 0), term1, power((1 - 2.0 / A) / denom,
                                                1 / 3.0))
    Z = (term1 - term2) / sqrt(2 / (9.0 * A))
    Z = where(equal(denom, 99), 0, Z)

    # The two-tailed p-value is twice the prob that value of a std normal r.v.
    # turns out to be greater than the (absolute) value of Z
    pvalue = 2 * (1 - norm.cdf(abs(Z)))
    assert pvalue >= 0.0 and pvalue <= 1.0
    return kurt, pvalue
Example #49
def autocorrelation(series, k=1, biased=True):
    """Returns autocorrelation of order 'k' and corresponding two-tailed pvalue.

    (Inspired by CLM pp.45-47)

    @param series: The series on which to compute autocorrelation
    @param k:      The order to which compute autocorrelation
    @param biased: If False, rho_k will be corrected according to Fuller (1976)

    @return: rho_k, pvalue
    """
    T = len(series)
    mu = mean(series)
    sigma = var(series)

    # Centered observations
    obs = series - mu
    lagged = lag(obs, k)
    truncated = obs[:-k]
    assert len(lagged) == len(truncated)

    # Multiplied by 'T' for numerical stability
    gamma_k = T * add.reduce(truncated * lagged)  # Numerator
    gamma_0 = T * add.reduce(obs * obs)  # Denominator
    rho_k = (gamma_k / gamma_0)
    if rho_k > 1.0: rho_k = 1.0  # Correct for numerical errors

    # The standard normal random variable
    Z = sqrt(T) * rho_k

    # Bias correction?
    if not biased:
        rho_k += (1 - rho_k**2) * (T - k) / (T - 1)**2
        Z = rho_k * T / sqrt(T - k)

    # The two-tailed p-value is twice the prob that value of a std normal r.v.
    # turns out to be greater than the (absolute) value of Z
    pvalue = 2 * (1 - norm.cdf(abs(Z)))
    assert pvalue >= 0.0 and pvalue <= 1.0
    return rho_k, pvalue
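# Hedged usage sketch for the autocorrelation above (assumes numpy is imported as
# np and that mean, var, add, sqrt, norm and the lag() helper used by the function
# are available in this module): for i.i.d. noise the lag-1 autocorrelation should
# be near zero and the p-value typically well above 0.05.
np.random.seed(1)
noise = np.random.normal(size=500)
rho1, pval = autocorrelation(noise, k=1)
print("rho_1=%.3f, two-tailed p=%.3f" % (rho1, pval))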
def generate_logistic():

    # Number of clusters
    nclust = 100

    # Regression coefficients
    beta = np.array([1, -2, 1], dtype=np.float64)

    ## Covariate correlations
    r = 0.4

    ## Cluster effects of covariates
    rx = 0.5

    ## Within-cluster outcome dependence
    re = 0.3

    p = len(beta)

    OUT = open("gee_logistic_1.csv", "w")

    for i in range(nclust):

        n = np.random.randint(3, 6)  # Cluster size

        x = np.random.normal(size=(n, p))
        x = rx * np.random.normal() + np.sqrt(1 - rx**2) * x
        x[:, 2] = r * x[:, 1] + np.sqrt(1 - r**2) * x[:, 2]
        pr = 1 / (1 + np.exp(-np.dot(x, beta)))
        z = re*np.random.normal() +\
            np.sqrt(1-re**2)*np.random.normal(size=n)
        u = norm.cdf(z)
        y = 1 * (u < pr)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n")

    OUT.close()
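# Minimal sketch (my reading of the trick above) of how within-cluster dependence
# is induced: an equicorrelated Gaussian vector is pushed through the normal CDF,
# giving dependent Uniform(0,1) draws, which are then thresholded against the
# per-observation success probabilities. The helper name is illustrative only.
import numpy as np
from scipy.stats import norm

def correlated_bernoulli(pr, re):
    n = len(pr)
    z = re*np.random.normal() + np.sqrt(1 - re**2)*np.random.normal(size=n)
    u = norm.cdf(z)      # each u[j] is Uniform(0,1) marginally, but the u's are dependent
    return 1*(u < pr)    # Bernoulli(pr[j]) marginally, correlated within the cluster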
Example #54
def variance_ratio(series, q, rw_hypothesis=1):
    """Returns 'VR(q)' and the corresponding pvalue.

    VR(q) here refers to the variance ratio suggested in Campbell, Lo and
    MacKinlay (1997), pp.49-57.

    @param series:        The series on which to compute VR.
    @param q:             Number of periods of the long-horizon return in VR
    @param rw_hypothesis: Which null hypothesis to test against. The value
      must be in [0, 1, 3]. Zero is a special value under which no pvalue is
      reported. One and three lead to the use of RW1 and RW3.

    @return: VR_q [, pvalue -- if rw_hypothesis!=0 ]
    """
    assert q > 1
    T = len(series)
    qf = float(q)
    VR_q = 1.0
    for k in range(1, q):  # Will sum till q-1 as desired
        # Use the float qf so the weight (1 - k/q) is not silently truncated
        # to zero by integer division under Python 2.
        VR_q += 2.0 * (1.0 - (k / qf)) * autocorrelation(series, k, biased=True)[0]

    # Zero is a special value under which no pvalue is reported
    if rw_hypothesis == 0:
        return VR_q

    # To be replaced by n*q in the version given on pp.52-55...
    nq = float(T - 1)
    Z = sqrt(nq) * (VR_q - 1.0)

    if rw_hypothesis == 1:
        Z /= sqrt(2.0 * (2 * q - 1) * (q - 1) / (3.0 * q))
        return VR_q, 2 * (1 - norm.cdf(abs(Z)))

    if rw_hypothesis == 3:
        raise NotImplementedError
    raise ValueError("'rw_hypothesis' must be in [0,1,3].")
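# Hedged usage sketch for the variance_ratio above (assumes numpy is imported as
# np and that sqrt, norm and the autocorrelation function defined earlier are
# available in this module): the increments of a pure random walk should give
# VR(q) close to 1 and a large RW1 p-value.
np.random.seed(2)
returns = np.random.normal(size=1000)   # i.i.d. increments of a random walk
vr2, pval = variance_ratio(returns, q=2, rw_hypothesis=1)
print("VR(2)=%.3f, two-tailed p=%.3f" % (vr2, pval))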
def generate_nominal():

    ## Regression coefficients
    beta1 = np.r_[0.5, 0.5]
    beta2 = np.r_[-1, -0.5]
    p = len(beta1)

    rz = 0.5

    OUT = open("gee_nominal_1.csv", "w")

    for i in range(200):

        n = np.random.randint(3, 6) # Cluster size

        x = np.random.normal(size=(n,p))
        x[:,0] = 1
        for j in range(1,x.shape[1]):
            x[:,j] += np.random.normal()
        pr1 = np.exp(np.dot(x, beta1))[:,None]
        pr2 = np.exp(np.dot(x, beta2))[:,None]
        den = 1 + pr1 + pr2
        pr = np.hstack((pr1/den, pr2/den, 1/den))
        cpr = np.cumsum(pr, 1)

        z = rz*np.random.normal() +\
            np.sqrt(1-rz**2)*np.random.normal(size=n)
        u = norm.cdf(z)

        y = (u[:,None] > cpr).sum(1)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j,:]]) + "\n")

    OUT.close()
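# Minimal sketch (assumed interpretation) of the category draw above: comparing a
# Uniform(0,1) value against the row of cumulative category probabilities and
# counting how many cut points it exceeds is inverse-CDF sampling of the nominal
# outcome. The helper name below is illustrative only.
import numpy as np

def draw_categories(pr, u):
    cpr = np.cumsum(pr, 1)            # per-observation cumulative probabilities
    return (u[:, None] > cpr).sum(1)  # 0-based index of the sampled category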