Example #1
    def ppf(self, x):
        """
        Computes the percent point function of the distribution at the point(s)
        x. It is defined as the inverse of the CDF. y = ppf(x) can be
        interpreted as the argument y for which the value of the cdf(x) is equal
        to y. Essentially that means the random varable y is the place on the
        distribution the CDF evaluates to x.

        Parameters
        ----------
        x: array, dtype=float, shape=(m x n), bounds=(0,1)
            The value(s) at which the user would like the ppf evaluated.
            If an array is passed in, the ppf is evaluated at every point
            in the array and an array of the same size is returned.

        Returns
        -------
        ppf: array, dtype=float, shape=(m x n)
            The ppf at each point in x.
        """
        if (x <= 0).any() or (x >= 1).any():
            raise ValueError('all values in x must be between 0 and 1, '
                             'exclusive')
        # chdtri inverts the upper-tail probability, hence the 1 - x
        ppf = chdtri(self.k, 1. - x)

        return ppf
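A quick way to sanity-check this ppf (assuming self.k holds the chi-square degrees of freedom, as the chdtri call suggests) is to compare chdtri(k, 1 - x) against scipy.stats.chi2.ppf, which computes the same quantile directly:

import numpy as np
from scipy.special import chdtri
from scipy.stats import chi2

k = 4                            # degrees of freedom (assumed value)
x = np.array([0.05, 0.5, 0.95])  # probabilities strictly inside (0, 1)

# chdtri inverts the upper-tail chi-square probability, so passing
# 1 - x yields the ordinary lower-tail quantile.
print(chdtri(k, 1 - x))          # [0.7107  3.3567  9.4877]
print(chi2.ppf(x, k))            # same values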
Example #2
def chi_square(alpha):

    samples = [
        np.random.exponential(scale=1, size=n),
        np.random.exponential(scale=1, size=m),
        np.random.exponential(scale=1 / alpha, size=k)
    ]

    n_total = n + m + k

    r = int((m + k) / 10)

    split_list = np.linspace(0, 10, r)
    v = np.zeros((r - 1, len(samples)))

    for idx_split in range(1, r):
        for idx_dist, dist in enumerate(samples):
            mask = np.logical_and(dist > split_list[idx_split - 1], dist < split_list[idx_split])
            v[idx_split - 1, idx_dist] = dist[mask].shape[0]
    delta = 0
    for i in range(r - 1):
        for j in range(len(samples)):
            row_sum = v[i, :].sum(dtype=np.int64)
            col_sum = v[:, j].sum(dtype=np.int64)
            if row_sum * col_sum != 0:
                # each cell contributes (observed - expected)^2 / expected;
                # dividing the running total here would wrongly rescale all
                # previously accumulated terms
                delta += (v[i, j] - row_sum * col_sum / n_total) ** 2 \
                    / (row_sum * col_sum)
    delta *= n_total

    criterion = chdtri(r - 1, gamma)
    if delta > criterion:
        return f'r = {r}, Delta = {delta:0.4f}, criterion = {criterion:0.4f}. \n' \
               f'The statistical data do CONFLICT with the H0 hypothesis.'
    else:
        return f'r = {r}, Delta = {delta:0.4f}, criterion = {criterion:0.4f}. \n' \
               f'The statistical data do NOT CONFLICT with the H0 hypothesis.'
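chi_square relies on module-level sample sizes and a significance level. A minimal driver, with assumed values for n, m, k, and gamma (the original module defines its own):

import numpy as np
from scipy.special import chdtri

n, m, k = 1000, 1000, 1000  # assumed sample sizes
gamma = 0.05                # assumed significance level

np.random.seed(0)
print(chi_square(alpha=1.0))  # H0 true: all three samples share one distribution
print(chi_square(alpha=3.0))  # H0 false: the third sample has a different scale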
Example #3
def power_compute(w=None, N=None, df=None, sig_level=None, power=None):
    k = None
    # compute the power
    if power is None:
        k = chdtri(df, sig_level)  # critical value (percent point)
        k = py_pchisq(q=k, df=df, ncp=N * w**2)  # area below the percent point k
    return k
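For reference, the same quantity can be computed with scipy alone. A sketch assuming py_pchisq returns the noncentral chi-square lower-tail probability, so that the power of the test is the complementary upper tail:

from scipy.special import chdtri
from scipy.stats import ncx2

def power_sketch(w, N, df, sig_level):
    crit = chdtri(df, sig_level)   # central chi-square critical value
    ncp = N * w**2                 # noncentrality parameter
    return ncx2.sf(crit, df, ncp)  # P(X > crit) under the alternative

print(power_sketch(w=0.2, N=500, df=1, sig_level=0.05))  # ~0.99 for these inputs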
Example #4
def chi_experiment(alpha):
    """
    :param alpha: the scale parameter
    :return: line about accepting or rejecting a hypothesis
    """
    arr = np.random.exponential(scale=1 / alpha, size=n)
    r = int(20 * n / 1000)

    z_gamma = chdtri(r - 1, gamma)

    nu, bin_edges = np.histogram(arr,
                                 bins=r,
                                 range=(expon.ppf(0.001), expon.ppf(0.999)))

    p = np.array([
        expon.sf(x=bin_edges[i - 1], scale=1) -
        expon.sf(x=bin_edges[i], scale=1) for i in range(1, r + 1)
    ])

    delta = np.sum(((nu - n * p)**2) / (n * p))

    if delta > z_gamma:
        return f'r = {r}, z_gamma = {z_gamma:.3f}, delta = {delta:.3f}. \n' \
               f'The statistical data do CONFLICT with the H0 hypothesis.'

    else:
        return f'r = {r}, z_gamma = {z_gamma:.3f}, delta = {delta:.3f}. \n' \
               f'The statistical data do NOT CONFLICT with the H0 hypothesis.'
Example #5
def chisq(fh):
    '''
    Parses .chisq files. See docs/file_formats_sumstats.txt
    '''
    dtype_dict = {
        # 'CHR': str,
        'SNP': str,
        # 'CM': float,
        # 'BP': int,
        'P': float,
        'CHISQ': float,
        'N': int,  # cast to int for typechecking, then switch to float for division
        'MAF': float,
        'INFO': float,
    }
    # read in text mode: reading as bytes would make the comparison with the
    # str entries of usecols silently fail under Python 3
    colnames = open(fh, 'r').readline().split()
    usecols = ['SNP', 'P', 'CHISQ', 'N', 'MAF', 'INFO']
    usecols = [x for x in usecols if x in colnames]
    try:
        x = pd.read_csv(fh,
                        header=0,
                        delim_whitespace=True,
                        usecols=usecols,
                        dtype=dtype_dict)
    except AttributeError as e:
        raise AttributeError('Improperly formatted chisq file: ' + str(e.args))

    try:
        check_N(x['N'])
    except KeyError as e:
        raise KeyError('No column named N in .chisq: ' + str(e.args))

    x['N'] = x['N'].astype(float)

    try:
        check_rsid(x['SNP'])
    except KeyError as e:
        raise KeyError('No column named SNP in .chisq: ' + str(e.args))

    if 'MAF' in x.columns:
        check_maf(x['MAF'])
        x['MAF'] = np.fmin(x['MAF'], 1 - x['MAF'])

    if 'P' in x.columns:
        check_pvalue(x['P'])
        # convert p-values to the equivalent 1-df chi-square statistics
        x['P'] = chdtri(1, x['P'])
        x.rename(columns={'P': 'CHISQ'}, inplace=True)
    elif 'CHISQ' in x.columns:
        check_chisq(x['CHISQ'])
    else:
        raise ValueError(
            '.chisq file must have a column labeled either P or CHISQ.')

    return x
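The chdtri(1, p) call converts a p-value into the equivalent 1-df chi-square statistic. A quick check of that identity (the 1-df chi-square statistic is the square of the two-sided standard normal quantile):

from scipy.special import chdtri, ndtri

p = 5e-8               # a typical GWAS significance threshold
chisq = chdtri(1, p)   # ~29.72
z = ndtri(1 - p / 2)   # two-sided normal quantile, ~5.45
print(chisq, z**2)     # the two agree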
Example #6
def obj_function(w=None,
                 N=None,
                 df=None,
                 sig_level=None,
                 power=None,
                 nc0=None):
    """
    Objective function which is used for the w and N computation.
    """
    k = chdtri(df, sig_level)  # critical value of the central chi-square
    k = py_pchisq(q=k, df=df, ncp=N * w**2, nc0=nc0) - power
    return k
Example #7
def power_compute(w=None,
                  N=None,
                  df=None,
                  sig_level=None,
                  power=None,
                  nc0=None):
    """
    Compute power with other variables.
    """
    k = None
    if power is None:
        k = chdtri(df, sig_level)  # compute the percentile (critical value)
        # evaluate py_pchisq only when k was computed; calling it with q=None
        # when power is supplied would fail
        k = py_pchisq(q=k, df=df, ncp=N * w**2, nc0=nc0)
    return k
Example #8
def max_noise_power(bins, n=1, confidence=0.99):
    """
    max_noise_power(bins, n=1, confidence=0.99):
        Return the power level that gives you some
        'confidence' that spectral noise could not cause
        that level in your power spectrum.  The total number
        of independent frequencies searched is 'bins'.
        This is P_detect in Vaughan et al. 1994, and is also
        known as P_threshold.
    """
    if n == 1:
        return -Num.log((1.0 - confidence) / bins)
    else:
        # the sum of n exponentially distributed powers follows half a
        # chi-square distribution with 2n degrees of freedom
        return 0.5 * chdtri(2.0 * n, (1.0 - confidence) / bins)
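A usage sketch (Num is presumably this module's numpy import):

import numpy as Num
from scipy.special import chdtri

bins = 10**6  # independent frequencies searched
print(max_noise_power(bins))       # single spectrum: ~18.4
print(max_noise_power(bins, n=3))  # 3 summed spectra: ~24.2, a higher threshold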
Example #9
def critical_value(mu_test, sigma, alpha):
    """Estimate the critical value for -2log(likelihood ratio) from bisecting CDF.

    Parameters
    ----------
    mu_test : float
        The assumed expectation value of the distribution to test.
    sigma : float
        The standard deviation of the distribution.
    alpha : float (between 0 and 1)
        The significance level (i.e. 1-CL).

    Returns
    -------
    sol : float
        The critical value for :math:`\lambda=-2\log(\Lambda)`.

    See also
    --------
    Rotes Buch V, p.57--59
    """
    if mu_test > 0.0:

        def target(l):
            return 1 - alpha - neg_2_log_likelihood_ratio_CDF(
                l, mu_test, sigma)

        a = 0.0
        fa = target(a)
        if fa < 0:
            raise RuntimeError("1 - alpha - CDF_{-2ln(R)}(0) is negative!")
        b = 10.0
        while target(b) > 0:
            b = 10 * b
        sol = bisect(target, 0.0, b)
    else:
        # p_gz = 1 - stats.norm.cdf(0, loc=mu_test, scale=sigma)
        p_gz = 1 - special.ndtr(-mu_test / sigma)
        t = 1.0 - alpha / p_gz
        if t > 0:
            # sol = stats.chi2.ppf(1.0 - alpha / p_gz, 1)
            sol = special.chdtri(1, alpha / p_gz)
        else:
            print('encountered questionable value < 0.0')
            sol = 0.0

    return sol
Example #10
def ptosAdentro(x, y, muX, muY, C, p):
    '''
    compute the fraction of points that fall inside the confidence region
    for a given probability
    '''
    # square root of inv(C): find A such that A.dot(A.T) = inv(C)
    l, v = ln.eig(ln.inv(C))
    A = np.sqrt(l.real) * v

    # map the vectors to the whitened (unit-covariance) frame
    X = np.array([x - muX, y - muY]).T.dot(A)

    # squared radius such that 2D Gaussian points fall inside with probability p
    r2 = chdtri(2, 1 - p)

    adentro = np.sum(ln.norm(X, axis=1) <= np.sqrt(r2)) / x.shape[0]

    return adentro
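A quick check with assumed inputs: sample a correlated 2D Gaussian and confirm the empirical fraction matches p (np, ln, and chdtri as imported elsewhere in these examples):

import numpy as np
import scipy.linalg as ln
from scipy.special import chdtri

np.random.seed(0)
C = np.array([[2.0, 0.8],
              [0.8, 1.0]])
L = np.linalg.cholesky(C)
pts = L @ np.random.randn(2, 100000)  # zero-mean samples with covariance C

print(ptosAdentro(pts[0], pts[1], 0.0, 0.0, C, 0.9))  # ~0.9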
Example #11
def chi_square():

    sample_1 = np.random.uniform(0, 1, size=n)
    sample_2 = sample_1 + np.random.uniform(-1, 1, size=n)

    df = pd.DataFrame(data=[sample_1, sample_2]).T

    bins_x = np.linspace(sample_1.min(initial=0).round(),
                         sample_1.max(initial=1).round(),
                         num=r)
    bins_y = np.linspace(sample_2.min(initial=-1).round(),
                         sample_2.max(initial=2).round(),
                         num=k)

    # clip the indices: np.digitize returns len(bins) for values at or above
    # the last edge, which would index out of bounds
    df.iloc[:, 0] = bins_x[np.digitize(df.iloc[:, 0], bins=bins_x).clip(max=r - 1)]
    df.iloc[:, 1] = bins_y[np.digitize(df.iloc[:, 1], bins=bins_y).clip(max=k - 1)]

    v = pd.crosstab(df.iloc[:, 0], df.iloc[:, 1]).to_numpy()

    n_total = int(v.sum())  # total observation count (not r + k)
    delta = 0
    for i in range(v.shape[0]):
        for j in range(v.shape[1]):
            row_sum = v.sum(axis=1, dtype=np.int64)[i]
            col_sum = v.sum(axis=0, dtype=np.int64)[j]
            # each cell contributes (observed - expected)^2 / expected;
            # dividing the running total would corrupt earlier terms
            delta += (v[i, j] - row_sum * col_sum / n_total) ** 2 \
                / (row_sum * col_sum)
    delta *= n_total
    criterion = chdtri((r - 1) * (k - 1), gamma)

    if delta > criterion:
        return f'r = {r}, k = {k}, delta = {delta:0.4f}, criterion = {criterion:0.4f} \n' \
               f'The statistical data do CONFLICT with the H0 hypothesis. \n'
    else:
        return f'r = {r}, k = {k}, delta = {delta:0.4f}, criterion = {criterion:0.4f} \n' \
               f'The statistical data do NOT CONFLICT with the H0 hypothesis. \n'
Example #12
def get_infectiousness(share_time,
                       degree,
                       p_time,
                       max_window=7200,
                       min_window=300,
                       min_count=5):
    """Estimate the infectiousness of an information cascade

    Arguments:
        share_time {numpy.ndarray} -- observed resharing times, sorted, share_time[0]=0
        degree {numpy.ndarray} -- observed node degrees
        p_time {numpy.ndarray} -- equally spaced vector of time to estimate the infectiousness, p_time[0]=0

    Keyword Arguments:
        max_window {int} -- maximum span of the locally weighted kernel (default: {7200})
        min_window {int} -- minimum span of the locally weighted kernel (default: {300})
        min_count {int} -- the minimum number of resharings included in the window (default: {5})

    Returns:
        tuple -- returns the tuple of vectors (infectiousness, p_up, p_low). The vectors represent the infectiousness
        and its upper and lower bounds for each time in the given `p_time`
    """
    share_time = np.sort(share_time)
    slopes = 2 / (p_time + 1e-8)
    slopes[slopes < 1 / max_window] = 1 / max_window
    slopes[slopes > 1 / min_window] = 1 / min_window

    windows = (p_time + 1e-8) / 2
    windows[windows > max_window] = max_window
    windows[windows < min_window] = min_window

    for j in range(len(p_time)):
        ind = (share_time >= p_time[j] - windows[j]) & (share_time < p_time[j])

        # ind is a boolean mask, so count the True entries; len(ind) is
        # always len(share_time)
        if ind.sum() < min_count:
            ind2 = np.where(share_time < p_time[j])[0]  # indices, like R's which()
            lcv = len(ind2)
            ind = ind2[max(lcv - min_count, 0):lcv]
            # ind[0] is the earliest retained index (R's 1-based ind[1])
            slopes[j] = 1 / (p_time[j] - share_time[ind[0]])
            windows[j] = p_time[j] - share_time[ind[0]]

    M_I = np.zeros((len(share_time), len(p_time)))
    for j in range(len(p_time)):
        M_I[:, j] = degree * integral_memory_kernel(p_time[j], share_time,
                                                    slopes[j], windows[j])

    infectiousness_seq = np.zeros(len(p_time))
    p_low_seq = np.zeros(len(p_time))
    p_up_seq = np.zeros(len(p_time))
    share_time = share_time[1:]

    for j in range(len(p_time)):
        share_time_tri = share_time[(share_time >= p_time[j] - windows[j])
                                    & (share_time < p_time[j])]
        rt_count_weighted = np.sum(slopes[j] * (share_time_tri - p_time[j]) +
                                   1)
        I = np.sum(M_I[:, j])
        rt_num = len(share_time_tri)
        if rt_count_weighted == 0:
            continue
        else:
            infectiousness_seq[j] = rt_count_weighted / I
            quant_low = chdtri(
                2 * rt_num,
                0.95)  # chdtri takes the upper-tail probability, i.e. 1 - quantile
            p_low_seq[j] = infectiousness_seq[j] * quant_low / (2 * rt_num)

            quant_up = chdtri(2 * rt_num, 0.05)
            p_up_seq[j] = infectiousness_seq[j] * quant_up / (2 * rt_num)

    return infectiousness_seq, p_up_seq, p_low_seq
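The two chdtri calls implement the classic chi-square construction of a confidence interval for a Poisson/exponential rate: with rt_num observed events, the interval scales the point estimate by chi-square quantiles with 2*rt_num degrees of freedom. As a standalone sketch with assumed numbers:

from scipy.special import chdtri

rate_hat = 0.4  # assumed point estimate of the event rate
rt_num = 25     # assumed number of observed events

low = rate_hat * chdtri(2 * rt_num, 0.95) / (2 * rt_num)  # lower 90% bound
up = rate_hat * chdtri(2 * rt_num, 0.05) / (2 * rt_num)   # upper 90% bound
print(low, up)  # interval around rate_hat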
Example #13
def get_infectiousness(
    share_time,
    degree,
    p_time,
    max_window=2 * 60 * 60,
    min_window=300,
    min_count=5,
):

    # ix <- sort(share_time, index.return=TRUE)$ix
    # share_time <- share_time[ix]
    share_time = np.sort(share_time)

    #   slopes <- 1/(p_time/2)
    # = 2/p_time; p_time[0] = 0 yields inf (with a numpy warning), which the
    # clamp against 1/min_window below immediately replaces
    slopes = 1 / (p_time / 2)
    #   slopes[slopes < 1/max_window] <- 1/max_window
    slopes[slopes < 1 / max_window] = 1 / max_window
    #   slopes[slopes > 1/min_window] <- 1/min_window
    slopes[slopes > 1 / min_window] = 1 / min_window

    #   windows <- p_time/2
    windows = p_time / 2
    #   windows[windows > max_window] <- max_window
    windows[windows > max_window] = max_window
    #   windows[windows < min_window] <- min_window
    windows[windows < min_window] = min_window

    #   for(j in c(1:length(p_time))) {
    for j in range(len(p_time)):  # range(len(...)) matches R's 1:length(...)
        #       ind <- which(share_time >= p_time[j] - windows[j] & share_time < p_time[j])
        ind = (share_time >= p_time[j] - windows[j]) & (share_time < p_time[j])

        #       if(length(ind) < min_count) {
        if ind.sum() < min_count:  # count True entries; R's which() returns indices
            #           ind2 <- which(share_time < p_time[j])
            ind2 = np.where(share_time < p_time[j])[0]
            #           lcv <- length(ind2)
            lcv = len(ind2)
            #           ind <-ind2[max((lcv-min_count),1):lcv]
            ind = ind2[max(lcv - min_count, 0):lcv]
            #           slopes[j] <-1/(p_time[j] - share_time[ind[1]])
            slopes[j] = 1 / (p_time[j] - share_time[ind[0]])  # R's ind[1] is 1-based

            #           windows[j] <- p_time[j] - share_time[ind[1]]
            windows[j] = p_time[j] - share_time[ind[0]]

#   M_I <- matrix(0,nrow=length(share_time),ncol=length(p_time))
    M_I = np.zeros((len(share_time), len(p_time)))
    #   for(j in 1:length(p_time)){
    for j in range(len(p_time)):  # range matches R's 1:length(...)
        #       M_I[,j] <- degree*integral_memory_kernel(p_time[j], share_time, slopes[j], windows[j])
        M_I[:, j] = degree * integral_memory_kernel(p_time[j], share_time,
                                                    slopes[j], windows[j])
#   infectiousness_seq <- rep(0, length(p_time))
    infectiousness_seq = np.zeros(len(p_time))
    #   p_low_seq <- rep(0, length(p_time))
    p_low_seq = np.zeros(len(p_time))
    #   p_up_seq <- rep(0, length(p_time))
    p_up_seq = np.zeros(len(p_time))
    #   share_time <- share_time[-1]          #removes the original tweet from retweet
    share_time = share_time[1:]  # R's x[-1] drops the first element (the original tweet)
    #   for(j in c(1:length(p_time))) {
    for j in range(len(p_time)):
        #       share_time_tri <- share_time[which(share_time >= p_time[j] - windows[j] & share_time < p_time[j])]
        share_time_tri = share_time[(share_time >= p_time[j] - windows[j])
                                    & (share_time < p_time[j])]
        #       rt_count_weighted <-   sum(slopes[j]*(share_time_tri - p_time[j]) + 1)
        rt_count_weighted = np.sum(slopes[j] * (share_time_tri - p_time[j]) +
                                   1)
        #       I <- sum(M_I[,j])
        I = np.sum(M_I[:, j])  # M_I[:, j] matches R's M_I[, j]
        #       rt_num <- length(share_time_tri)
        rt_num = len(share_time_tri)
        #       if (rt_count_weighted==0)
        if rt_count_weighted == 0:
            #           next
            continue
#       else {
        else:
            #           infectiousness_seq[j] <- (rt_count_weighted)/I
            infectiousness_seq[j] = rt_count_weighted / I
            #           p_low_seq[j] <- infectiousness_seq[j] * qchisq(0.05, 2*rt_num) / (2*rt_num)
            quant_low = chdtri(2 * rt_num,
                               1 - 0.05)  # chdtri takes the upper-tail prob: R's qchisq(0.05, .)
            p_low_seq[j] = infectiousness_seq[j] * quant_low / (2 * rt_num)

            #           p_up_seq[j] <- infectiousness_seq[j] * qchisq(0.95, 2*rt_num) / (2*rt_num)
            quant_up = chdtri(2 * rt_num, 1 - 0.95)  # = R's qchisq(0.95, .)
            p_up_seq[j] = infectiousness_seq[j] * quant_up / (2 * rt_num)

#   ## p_low_seq[is.nan(p_low_seq)] <- 0
#   ## p_up_seq[is.nan(p_up_seq)] <- 0
#   list(infectiousness = infectiousness_seq, p.up = p_up_seq, p.low = p_low_seq)
    return infectiousness_seq, p_up_seq, p_low_seq
Example #14
'''
to fix: the plot of the ellipses built from the covariance matrices
does not come out right
'''
# %% imports
import numpy as np
import scipy.linalg as ln
from scipy.special import chdtri, chdtrc
import matplotlib.pyplot as plt

# %%
fi = np.linspace(0, 2 * np.pi, 100)
r = np.sqrt(chdtri(2, 0.1))  # radius so that 90% of the points fall inside
# r = 1
Xcirc = np.array([np.cos(fi), np.sin(fi)]) * r


def unit2CovTransf(C):
    '''
    returns the matrix that transforms points from the unit normal pdf to a
    normal pdf of covariance C, so that
    Xnorm = np.random.randn(2, n)  # generate random points in 2D
    T = unit2CovTransf(C)  # calculate the transform matrix
    X = np.dot(T, Xnorm)  # points that follow a normal pdf of cov C
    '''
    l, v = ln.eig(C)

    # matrix such that T.dot(T.T) == C
    T = np.sqrt(l.real) * v

    return T
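Putting the pieces together (reusing the imports, r, and Xcirc defined above), a minimal sketch that draws the 90% ellipse over samples from an assumed covariance:

C = np.array([[2.0, 0.8],
              [0.8, 1.0]])
T = unit2CovTransf(C)

X = T @ np.random.randn(2, 2000)  # samples with covariance ~C
xeli, yeli = T @ Xcirc            # circle of radius r mapped onto the ellipse

plt.plot(X[0], X[1], '.', ms=1)
plt.plot(xeli, yeli, 'r')         # ~90% of the points fall inside
plt.show()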
Example #15
# print(py_pchisq(q=0.5, df=2, ncp=0.001, lower_tail=True, log_p=False))


def power_compute(w=None, N=None, df=None, sig_level=None, power=None):
    k = None
    # compute the power
    if power is None:
        k = chdtri(df, sig_level)  # critical value (percent point)
        k = py_pchisq(q=k, df=df, ncp=N * w**2)  # area below the percent point k
    return k

# print(power_compute(w=0.2, N=500, df=1, sig_level=0.05, power=None))  # checks out


k = chdtri(df, sig_level)


def obj_function(w=None, N=None, df=None, sig_level=None, power=None):
    return py_pchisq(q=k, df=df, ncp=N * w**2) - power  # area below the percent point k


# compute w by scanning for a root of obj_function
if w is None:
    for i in range(100000):
        w0 = i / 100000
        p = obj_function(w=w0, N=N, df=df, sig_level=sig_level, power=power)
        if abs(p) < 0.001:
            # report the first w that approximately achieves the target power
            print('w =', w0)
            break
Example #16
def _invchi2(q, df):
    return special.chdtri(df, q)
Example #17
def betaprod(fh):
    '''
    Parses .betaprod files. See docs/file_formats_sumstats.txt
    '''
    dtype_dict = {
        #		'CHR': str,
        'SNP': str,
        #		'CM': float,
        #		'BP': int,
        'P1': float,
        'CHISQ1': float,
        'DIR1': float,
        'N1': int,  # cast to int for typechecking, then switch to float later for division
        'P2': float,
        'CHISQ2': float,
        'DIR2': float,
        'N2': int,
        'INFO1': float,
        'INFO2': float,
        'MAF1': float,
        'MAF2': float
    }
    # read in text mode so the str comparison against usecols works in Python 3
    colnames = open(fh, 'r').readline().split()
    usecols = [
        x + str(i) for i in range(1, 3)
        for x in ['DIR', 'P', 'CHISQ', 'N', 'MAF', 'INFO']
    ]
    usecols.append('SNP')
    usecols = [x for x in usecols if x in colnames]
    try:
        x = pd.read_csv(fh,
                        header=0,
                        delim_whitespace=True,
                        usecols=usecols,
                        dtype=dtype_dict)
    except AttributeError as e:
        raise AttributeError('Improperly formatted betaprod file: ' +
                             str(e.args))

    try:
        check_rsid(x['SNP'])
    except KeyError as e:
        raise KeyError('No column named SNP in .betaprod: ' + str(e.args))

    for i in ['1', '2']:
        N = 'N' + i
        P = 'P' + i
        CHISQ = 'CHISQ' + i
        DIR = 'DIR' + i
        MAF = 'MAF' + i
        INFO = 'INFO' + i
        BETAHAT = 'BETAHAT' + i
        try:
            check_N(x[N])
        except KeyError as e:
            raise KeyError('No column named {N} in .betaprod: '.format(N=N) +
                           str(e.args))
        x[N] = x[N].astype(float)
        try:
            check_dir(x[DIR])
        except KeyError as e:
            raise KeyError('No column named {D} in .betaprod: '.format(D=DIR) +
                           str(e.args))

        if CHISQ in x.columns:
            check_chisq(x[CHISQ])
            betahat = np.sqrt(x[CHISQ] / x[N]) * x[DIR]
            x[CHISQ] = betahat
            x.rename(columns={CHISQ: BETAHAT}, inplace=True)
        elif P in x.columns:
            check_pvalue(x[P])
            betahat = np.sqrt(chdtri(1, x[P]) / x[N]) * x[DIR]
            x[P] = betahat
            x.rename(columns={P: BETAHAT}, inplace=True)
        else:
            raise ValueError(
                'No column named P{i} or CHISQ{i} in betaprod.'.format(i=i))

        del x[DIR]
        if MAF in x.columns:
            check_maf(x[MAF])
            # np.fmin takes the elementwise minimum; np.min would read the
            # second argument as an axis
            x[MAF] = np.fmin(x[MAF], 1 - x[MAF])

    return x
Example #18
    rootsPoly = roots(poly)
    realRoots = rootsPoly[isreal(rootsPoly)].real
    rMax = max(realRoots)

    r = linspace(0, rMax)
    rDist = polyval(pNum, r) / polyval(pDen, r)

    figure()
    plot(r, rDist)
    xlabel("radio")
    ylabel("radio distorsionado")


# %%
fi = linspace(0, 2 * pi, 100)
r = sqrt(chdtri(2, 0.1))  # radius so that 90% of the points fall inside
# r = 1
Xcirc = array([cos(fi), sin(fi)]) * r



def plotEllipse(ax, C, mux, muy, col):
    '''
    plot the ellipse associated with the covariance C, centered at (mux, muy)
    '''

    T = unit2CovTransf(C)
    # rotate and rescale to map the circle onto the ellipse
    xeli, yeli = dot(T, Xcirc)

    ax.plot(xeli+mux, yeli+muy, c=col, lw=0.5)
Example #19
def transform(cube):
    return chdtri(1, cube)
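This one-liner has the shape of an inverse-CDF prior transform, as used by nested samplers where cube holds uniform draws on (0, 1): chdtri(1, cube) maps them to chi-square(1) distributed values. A quick check:

import numpy as np
from scipy.special import chdtri

u = np.random.rand(100000)            # uniform samples from the unit cube
samples = chdtri(1, u)                # chi-square(1) distributed
print(samples.mean(), samples.var())  # ~1 and ~2, the chi2(1) moments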
Example #20
def ChiMerge(df, col, target, max_bin=5, p=0.1, min_binpct=0, special_attribute=[], special_cols=[]):

    if col in special_cols:  # if the variable has special values, df2 excludes the rows holding them
        df2 = df.loc[~df[col].isin(special_attribute)]
        col_max = df2[col].max()
    else:
        df2 = df.copy()

    N = df2.shape[0]

    col_unique = sorted(list(set(df2[col])))  # sorted unique values of the variable, special values excluded
    n = len(col_unique)

    if n <= max_bin:
        print("The number of original levels for {} is less than or equal to max_bin".format(col))
        df2['col_map'] = df2[col]
        (dict_bad, regroup, all_bad_rate) = bin_bad_rate(df2, 'col_map', target, grantRateIndicator=1)
        cutoffpoints = col_unique[:-1]

    else:
        if n > 100:  # if the variable has more than 100 unique values, map x to its split value via split_data and assign_group
            split_col = split_data(df2, col, 100)  # this caps the number of unique values at 100
            df2['col_map'] = df2[col].map(lambda x: assign_group(x, split_col))
        else:
            df2['col_map'] = df2[col]  # no more than 100 unique values, so no mapping is needed

        (dict_bad, regroup, all_bad_rate) = bin_bad_rate(df2, 'col_map', target, grantRateIndicator=1)
        col_map_unique = sorted(list(set(df2['col_map'])))  # deduplicate and sort the mapped values
        group_interval = [[i] for i in col_map_unique]  # wrap each value of col_map_unique in its own list

        chiq_limit = chdtri(1, p)  # chi-square critical value at significance p
        chi_min = 0  # initialize the minimum chi-square (at p=0.1 the limit is about 2.7, so 0 is safe)

        while (len(group_interval) > max_bin) or (chi_min < chiq_limit):  # loop while there are more than max_bin groups or the minimum chi-square is below the critical value (either condition suffices)
            chi_list = []
            for i in range(len(group_interval) - 1):
                temp_group = group_interval[i] + group_interval[i + 1]  # candidate merged interval, as a list, e.g. [1, 3]
                chi_df = regroup[regroup['col_map'].isin(temp_group)]
                chi_value = cal_chi2(chi_df, all_bad_rate)  # chi-square of each pair of adjacent intervals
                chi_list.append(chi_value)
            # merge the pair of intervals with the smallest chi-square value
            best_combined = chi_list.index(min(chi_list))  # index of the smallest chi-square value
            group_interval[best_combined] = group_interval[best_combined] + group_interval[best_combined + 1]
            # drop the right interval of the merged pair
            group_interval.remove(group_interval[best_combined + 1])
            chi_min = min(chi_list)

        # sort the values within each merged interval
        group_interval = [sorted(i) for i in group_interval]
        # the cutoff point of each interval is its maximum
        cutoffpoints = [max(i) for i in group_interval[:-1]]

    # check whether any bin holds only good or only bad samples
    df2['col_map_bin'] = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))  # map col_map to its bin via cutoffpoints
    # compute the bad rate of each bin
    (dict_bad, regroup) = bin_bad_rate(df2, 'col_map_bin', target)
    # compute the minimum and maximum bad rates
    [min_bad_rate, max_bad_rate] = [min(dict_bad.values()), max(dict_bad.values())]

    # a minimum bad rate of 0 means a bin holds only good samples; a maximum of 1 means only bad samples
    while min_bad_rate == 0 or max_bad_rate == 1:
        bad01_index = regroup[regroup['bad_rate'].isin([0, 1])].col_map_bin.tolist()  # bins whose bad rate is 0 or 1
        bad01_bin = bad01_index[0]
        if bad01_bin == max(regroup.col_map_bin):
            cutoffpoints = cutoffpoints[:-1]  # if bad01_bin is the highest bin, drop the largest cutoff point
        elif bad01_bin == min(regroup.col_map_bin):
            cutoffpoints = cutoffpoints[1:]  # if bad01_bin is the lowest bin, drop the smallest cutoff point
        else:
            bad01_bin_index = list(regroup.col_map_bin).index(bad01_bin)  # index of bad01_bin
            prev_bin = list(regroup.col_map_bin)[bad01_bin_index - 1]  # the bin before bad01_bin
            df3 = df2[df2.col_map_bin.isin([prev_bin, bad01_bin])]
            (dict_bad, regroup1) = bin_bad_rate(df3, 'col_map_bin', target)
            chi1 = cal_chi2(regroup1, all_bad_rate)  # chi-square of the previous bin and bad01_bin
            later_bin = list(regroup.col_map_bin)[bad01_bin_index + 1]  # the bin after bad01_bin
            df4 = df2[df2.col_map_bin.isin([later_bin, bad01_bin])]
            (dict_bad, regroup2) = bin_bad_rate(df4, 'col_map_bin', target)
            chi2 = cal_chi2(regroup2, all_bad_rate)  # chi-square of the next bin and bad01_bin
            if chi1 < chi2:  # if chi1 < chi2, drop the cutoff point of the previous bin
                cutoffpoints.remove(cutoffpoints[bad01_bin_index - 1])
            else:  # otherwise drop the cutoff point of bad01_bin
                cutoffpoints.remove(cutoffpoints[bad01_bin_index])

        df2['col_map_bin'] = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))
        (dict_bad, regroup) = bin_bad_rate(df2, 'col_map_bin', target)
        # re-map col_map to bins and recompute the min and max bad rates; the loop stops once no bin has a bad rate of 0 or 1
        [min_bad_rate, max_bad_rate] = [min(dict_bad.values()), max(dict_bad.values())]

    # check the minimum bin share after binning
    if min_binpct > 0:
        group_values = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))  # map with the cutoffpoints that passed the checks above
        df2['col_map_bin'] = group_values  # map col_map to its bin
        group_df = group_values.value_counts().to_frame().sort_index()
        group_df['bin_pct'] = group_df['col_map'] / N  # share of each bin
        min_pct = group_df.bin_pct.min()  # smallest bin share
        while min_pct < min_binpct and len(cutoffpoints) > 2:  # loop while the smallest bin share is below min_binpct and more than 2 cutoff points remain
            # the logic below mirrors the "only good/bad samples" check above
            min_pct_index = group_df[group_df.bin_pct == min_pct].index.tolist()
            min_pct_bin = min_pct_index[0]
            if min_pct_bin == max(group_df.index):
                cutoffpoints = cutoffpoints[:-1]
            elif min_pct_bin == min(group_df.index):
                cutoffpoints = cutoffpoints[1:]
            else:
                minpct_bin_index = list(group_df.index).index(min_pct_bin)
                prev_pct_bin = list(group_df.index)[minpct_bin_index - 1]
                df5 = df2[df2['col_map_bin'].isin([min_pct_bin, prev_pct_bin])]
                (dict_bad, regroup3) = bin_bad_rate(df5, 'col_map_bin', target)
                chi3 = cal_chi2(regroup3, all_bad_rate)
                later_pct_bin = list(group_df.index)[minpct_bin_index + 1]
                df6 = df2[df2['col_map_bin'].isin([min_pct_bin, later_pct_bin])]
                (dict_bad, regroup4) = bin_bad_rate(df6, 'col_map_bin', target)
                chi4 = cal_chi2(regroup4, all_bad_rate)
                if chi3 < chi4:
                    cutoffpoints.remove(cutoffpoints[minpct_bin_index - 1])
                else:
                    cutoffpoints.remove(cutoffpoints[minpct_bin_index])

            group_values = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))  # map with the updated cutoffpoints
            df2['col_map_bin'] = group_values  # map col_map to its bin
            group_df = group_values.value_counts().to_frame().sort_index()
            group_df['bin_pct'] = group_df['col_map'] / N  # share of each bin
            min_pct = group_df.bin_pct.min()  # smallest bin share

    if col in special_cols:
        cutoffpoints.append(col_max)  # when some values are excluded from binning, append the variable's maximum as a cutoff point

    return cutoffpoints  # cutoff points, the maximum of each bin (with excluded special values the variable's maximum is appended; otherwise the last point is the last bin's minimum)
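The merge loop stops once every adjacent pair's chi-square statistic exceeds chdtri(1, p), the 1-dof critical value at significance p. For the default p=0.1 that limit is about 2.706, the "about 2.7" the initialization comment refers to:

from scipy.special import chdtri

for p in (0.1, 0.05, 0.01):
    print(p, chdtri(1, p))  # 2.706, 3.841, 6.635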
Example #21
def chi2isf(p, df):
    return special.chdtri(df, p)
Example #22
# %% this example works; something is still missing for the diagonalization
from scipy.special import chdtri
import numpy as np

# %%
rho = -0.6
N = 100000
D = 2
S = np.diag(np.arange(D) + 1.0)  # float dtype so the assignment below is not truncated to int
S[[0, 1], [1, 0]] = np.sqrt(S[0, 0] + S[1, 1]) * rho

p = 0.74

x = np.random.randn(N, D)

r2 = chdtri(D, 1 - p)  # squared radius enclosing probability p in D dimensions
pNum = np.sum(np.sum(x**2, axis=1) <= r2) / N

# print(r)
print(p)
print(pNum)
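The covariance S above is built but never used: the check covers only the unit covariance, which is presumably the missing diagonalization step the note refers to. A sketch of that step under the same whitening idea: draw correlated samples, whiten them with the eigendecomposition of S, and the same chdtri radius applies:

l, v = np.linalg.eigh(S)        # S = v @ np.diag(l) @ v.T
T = v * np.sqrt(l)              # T @ T.T == S

y = x @ T.T                     # samples with covariance S
y_white = y @ (v / np.sqrt(l))  # whiten back to unit covariance
pCorr = np.sum(np.sum(y_white**2, axis=1) <= r2) / N
print(pCorr)                    # ~p again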
Example #23
def calculoT(v, p):
    """
    v = degrees of freedom.
    p = probability (the significance level alpha?).
    """
    return chdtri(v, p)
Example #24
def _invchi2(q, df):
    return special.chdtri(df, q)  # pylint: disable=no-member
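All of these thin wrappers rely on the same identity: scipy.special.chdtri(df, q) inverts the chi-square survival function, i.e. it equals scipy.stats.chi2.isf(q, df). A quick equivalence check:

from scipy import special, stats

df, q = 5, 0.025
print(special.chdtri(df, q))  # 12.8325...
print(stats.chi2.isf(q, df))  # same value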