Example #1
    def filter(self, data, *args, **kwargs):

        if (len(data) == 4):
            c1 = data[0]
            n1 = data[1]
            c2 = data[2]
            n2 = data[3]
        else:
            c1 = data[0]
            n1 = self.n1
            c2 = data[1]
            n2 = self.n2

        if c1.ndim > 2:
            p = c1.shape[0]
        else:
            p = 1

        lnq = p * ((n1 + n2) * np.log(n1 + n2) - n1 * np.log(n1) - n2 * np.log(n2)) + \
              n1 * np.log(block_det(c1)) + \
              n2 * np.log(block_det(c2)) - \
              (n1 + n2) * np.log(block_det(c1 + c2))

        rho = 1 - (2 * p ** 2 - 1) * (1 / n1 + 1 / n2 - 1 / (n1 + n2)) / (6 * p)

        o_2 = p ** 2 * (p ** 2 - 1) * \
              (1 / n1 ** 2 + 1 / n2 ** 2 - 1 / (n1 + n2) ** 2) / (24 * rho ** 2) - \
              0.25 * p ** 2 * (1 - 1 / rho) ** 2

        lnq *= -2 * rho

        pfa = (1 - o_2) * chi2.cdf(lnq, p ** 2) + o_2 * chi2.cdf(lnq, p ** 2 + 4)

        return (pfa, median_filter(pfa > self.p_thresh, 3, mode='constant', cval=0))
Example #2
def serial_test(bits, size=5):
    n = len(bits)
    omeg = [0,0,0]
    for i in range(min(3, size)):
        omeg[i] = 2**(size - i) / n * sum([x**2 for x in fqs(bits, size - i)]) - n
    domeg = [0,0]
    domeg[0] = omeg[0] - omeg[1]
    domeg[1] = omeg[0] - 2*omeg[1] + omeg[2]
    return [1 - chi2.cdf(domeg[0], 2**(size-1)), 1 - chi2.cdf(domeg[1], 2**(size - 2))]
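A minimal usage sketch for the serial test above (an assumption: the block-frequency helper fqs that the function calls is importable from the same module; the bit sample here is synthetic):

import random

bits = [random.randint(0, 1) for _ in range(100000)]
p_values = serial_test(bits, size=5)   # two p-values, one per difference statistic
print(p_values)                        # both should be well above 0.05 for unbiased bits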
Example #3
def truncated_chi2_mean(c, k):
    """
    chi2 mean up to the cutoff.

    Compute A/B with:
    A = integral[0..c] x chi2(k) dx
    B = integral[0..c] chi2(k) dx
    B is computed via chi2.cdf and A is computed via:
    A = k * integral[0..c] chi2(k+2) dx
    """
    A = k * chi2.cdf(c, k + 2)
    B = chi2.cdf(c, k)
    return A / B
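A quick numerical check of the identity the docstring relies on (a sketch, not part of the original snippet): the truncated first moment integral[0..c] x chi2(k) dx equals k * chi2.cdf(c, k + 2).

from scipy.integrate import quad
from scipy.stats import chi2

c, k = 5.0, 3
A_numeric, _ = quad(lambda x: x * chi2.pdf(x, k), 0, c)  # direct integral of x * pdf
A_closed = k * chi2.cdf(c, k + 2)                        # closed form used above
print(A_numeric, A_closed)        # agree to numerical precision
print(A_closed / chi2.cdf(c, k))  # the truncated mean returned by the function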
Example #4
def Main():
    global args,out

    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        out=open(args.output,"w")
    print >>out,"# INPUT FILE:",args.input
    print >>out,"# PVALUE CUT:",args.pvalue
    print >>out,"# MAX    GAP:",args.gap
    for chrom,pos,x2s,matrix_x2,snps in parseIterRegion(args.input):
        '''
        compute pvalue (from i to j)
        p[i][j] is the pvalue
        we are here to segment the snps into smaller regions
        '''
        print >>sys.stderr,"BiGSEGMent",chrom,len(pos),pos[0],pos[-1]
        if args.fast:
            if len(pos)>1000:
                SqOR=0.0
                for k in range(len(pos)):
                    SqOR+=x2s[k]
                p=1.0-chi2.cdf(SqOR,len(pos))
                print >>out,"BIGREGION\t",chrom,"\t",pos[0],"\t",pos[-1]+1,"\t",p

                for i in range(len(pos)):
                    print >>out,"SNP\t",chrom,pos[i],snps[i],x2s[i],matrix_x2[i]
                print >>out,""
                continue




        p=numpy.array([[1.0 for row in range(len(pos))] for col in range(len(pos))])

        for i in range(len(pos)):
            SqOR=0.0
            print >>sys.stderr,i,"\r",
            for j in range(i,len(pos)):
                SqOR+=x2s[j]
                p[i][j]=1.0-chi2.cdf(SqOR,j+1-i)
 #               print "PV",i,j,p[i][j],SqOR,j+1-i
        for (start,end) in Segment(p):
            #print start,end,p[start][end]
            if (p[start][end]<args.pvalue):
                print >>out,"REGION\t",chrom,"\t",pos[start],"\t",pos[end]+1,"\t",p[start][end]

                for i in range(start,end+1):
                    print >>out,"SNP\t",chrom,pos[i],snps[i],x2s[i],matrix_x2[i]
                print >>out,""
Example #5
def SPLL(W1, W2, K=3):
    n1 = W1.shape[1]
    n2 = W2.shape[1]

    assert n1==n2, "The number of features must be the same for W1 and W2"
    n = n1
    s1 = log_LL(W1,W2,K)
    s2 = log_LL(W2,W1,K)

    st = max(s1,s2)

    pst = min(chi2.cdf(st,n), 1-chi2.cdf(st,n))
    Change = float(pst < 0.05)
    return Change, pst, st
Example #6
 def hypothesisTest(self, seq1, seq2, totalSeq1, totalSeq2):
   # Contingency table:
   # x1 x2
   # y1 y2
   x1 = seq1
   x2 = seq2
   y1 = totalSeq1 - x1
   y2 = totalSeq2 - x2
   
   if (x1 == 0 and x2 == 0) or (x1 == totalSeq1 or x2 == totalSeq2):
     return float('inf'), 1.0, 'degenerate case: suspect p-value'
   
   N = x1+x2+y1+y2
   
   E00 = float((x1+x2) * (x1+y1)) / N
   E01 = float((x1+x2) * (x2+y2)) / N
   E10 = float((y1+y2) * (x1+y1)) / N
   E11 = float((y1+y2) * (x2+y2)) / N
 
   X2 = (abs(x1 - E00)-0.5)**2 / E00
   X2 += (abs(x2 - E01)-0.5)**2 / E01
   X2 += (abs(y1 - E10)-0.5)**2 / E10
   X2 += (abs(y2 - E11)-0.5)**2 / E11
   
   # calculate p-value
   pValueTwoSided = 1.0 - chi2.cdf(X2,1)
 
   return float('inf'), pValueTwoSided, ''
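The method above builds a Yates-corrected chi-square statistic for a 2x2 contingency table by hand. As a cross-check sketch (hypothetical counts, not from the original project), scipy's chi2_contingency with correction=True should give essentially the same two-sided p-value:

import numpy as np
from scipy.stats import chi2_contingency

table = np.array([[12, 5],     # x1, x2
                  [30, 41]])   # y1, y2
stat, p_two_sided, dof, expected = chi2_contingency(table, correction=True)
print(stat, p_two_sided, dof)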
Example #7
def combine_p_values(the_p_values, method='z', default_quantile=7.):
    """Combines p-values from repeat measurements into a single
    p-value.

    the_p_values: a list of p-values.

    method: String. 'z'|'fisher'.  'z' for using the weighted z-score.
    'fisher' for using fisher's combined probability test.

    default_quantile: Float.  Only used for z method.  The quantile to
    use when the software's normal inverse cdf(p-value) is infinite
    """
    if len(the_p_values) == 1 or sum(the_p_values) == 0:
        combined_p_value = sum(the_p_values)
        
    elif method.lower() == 'z':
        #combine p-values using weighted z-score.  To not deal with inifinite
        #values replace 
        the_quantiles = []
        for the_p in the_p_values:
            the_quantile = norm.ppf(1.-the_p)
            if isinf(the_quantile):
                the_quantile = default_quantile
            the_quantiles.append(the_quantile)
        combined_p_value = norm.sf(sum(the_quantiles) / len(the_quantiles)**0.5)
    elif method.lower() == 'fisher':
        combined_p_value = 1-chi2.cdf(-2*sum(map(log,
                                                    the_p_values)),
                                         2*len(the_p_values))


    return combined_p_value
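A usage sketch for the helper above (it assumes norm, chi2, isinf and log are already imported in its module, as the body requires); the same three repeat p-values combined by both supported methods:

print(combine_p_values([0.04, 0.10, 0.07], method='fisher'))  # Fisher's combined test
print(combine_p_values([0.04, 0.10, 0.07], method='z'))       # weighted z-score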
Example #8
	def pval(self, data):
		"""
		Determines whether this model id better then a simple gaussian model.
		Uses a KStest on the data and determines the p-val of this distribution
		and a gaussian distribution.
		returns:
			(TF, this_p)
		TF indicates whether this is a reasonable approximation of the data.
		"""
		t_data = self.treatdata(data)
		n_mean = N.mean(t_data)
		n_std = N.std(t_data)
		
		simple_g = m_modal([('norm', (data.mean(), data.std()), 1)])
		simple_g.use_bc = self.use_bc
		#give original data so not transformed twice
		ll_s = simple_g.log_likelihood(data)
		ll_this = self.log_likelihood(data)
		
		ratio = 2*(ll_this - ll_s)
		#print 'chi2', ratio
		
		# 6 degrees of freedom from Adam,
		# based on bimodality in blood paper
		# the real test uses 3
		# from likelihood ratio test in wikipedia
		this_p = 1-chi2.cdf(ratio, 3)
		#print (ll_s, ll_this), ratio, this_p
		TF = this_p > 0.05
		return TF, this_p
Example #9
 def getPValue(self):
     """returns p-value of chi^2 test"""
     p = chi2.cdf(self.getChisquare(), self.getDoF())
     if p < 0.5:
         return p, 'l'
     else:
         return 1 - p, 'r'
Example #10
    def showTwilightPixelDeviationFromMedian(self,row,col):
        x = self.getChisq(row,col)
        reducedChisq = x['reducedChisq']
        chisq = x['chisq']
        percentDiffSpectrum = x['percentDiffSpectrum']
        deltaPercentDiffSpectrum = x['deltaPercentDiffSpectrum']
        nDeltaFromZero = x['nDeltaFromZero']
        degreesOfFreedom = x['degreesOfFreedom']
        print 'reduced chisq =',reducedChisq
        print 'P-value =',1-chi2.cdf(chisq,degreesOfFreedom)

        pop = PopUp(parent=self,title='showTwilightPixelDeviationFromMedian')
        pop.axes.errorbar(self.wvlBinEdges[:-1],percentDiffSpectrum,linestyle='-',color='k',yerr=deltaPercentDiffSpectrum)
        pop.axes.set_xlabel(r'$\lambda$ ($\AA$)')
        pop.axes.set_ylabel(r'percent difference')
        pop.axes.plot(self.wvlBinEdges[:-1],len(self.wvlBinEdges[:-1])*[0],'gray')
        axes2 = pop.axes.twinx()
        axes2.plot(self.wvlBinEdges[:-1],nDeltaFromZero,'m',alpha=.7)
        align_yaxis(pop.axes,0,axes2,0)
        axes2.set_ylabel(r'(pixelSpectrum-avgSpectrum)/$\sigma$',color='m')
        pop.axes.set_title('Deviation from Avg Spectrum (%d,%d)'%(row,col))
        pop.draw()

        weights = self.flatInfo['weights'][row,col]
        pop = PopUp(parent=self,title='showTwilightPixelDeviationFromMedian')
        pop.axes.step(self.wvlBinEdges[:-1],self.averageTwilightSpectrum/self.wvlBinWidths,'k',label='avg')
        pop.axes.step(self.wvlBinEdges[:-1],self.twilightSpectra[row,col]/self.wvlBinWidths,'b',label='weighted')
        pop.axes.step(self.wvlBinEdges[:-1],(self.twilightSpectra[row,col]/weights)/self.wvlBinWidths,'r',label='raw')
        pop.axes.set_xlabel(r'$\lambda$ ($\AA$)')
        pop.axes.set_ylabel(r'counts per $\AA$')
        pop.axes.set_title('Twilight Spectrum (%d,%d)'%(row,col))
        pop.axes.legend(loc='lower right')
        pop.draw()
Example #11
def chisquare(observed, expected, threshold = 0.95, freedom = 0):
    '''Performs chi square test with given parameters
    observed is a list of observed values
    expected is a list of expected values
    observed and expected must be same length
    freedom is the degrees of freedom in the variable
    if freedom not provided, it will be set automatically as len - 1
    threshold is probability threshold desired

    return value is a True or False depending on whether or not list is statistically significant
    
    True for split is "by chance" and prune the decision
    False for split is not "by chance" and do not prune the decision
    '''
    if not freedom:
        freedom = len(expected) - 1
    
    chi = 0
    for ob, ex in zip(observed, expected):
        if ex:
            chi += (ob - ex) ** 2 / ex
        else:
            pass # Check what the mathematical convention for 0 is
            # looks like we should do
            # freedom = freedom - 1
    
    pval = 1 - chi2.cdf(chi, freedom)
    return pval < threshold
Example #12
    def __init__(self, x, alpha=0.05, max_points=1000):
        # compute Mardia test coefficient
        n, p = x.shape   # num points, num dimensions
        mu = np.mean(x, axis=0)
        C = np.cov(x.T, bias=1) if p > 1 else np.array([[np.var(x.T, ddof=1)]])
        # squared Mahalanobis distance matrix
        # Note: this forms a full n x n matrix of distances, so will
        # fail for a large number of points.  Kurtosis only requires
        # the diagonal elements so can be computed cheaply.  If there
        # is no order to the points, skew could be estimated using only
        # the block diagonal
        dx = (x - mu[None, :])[:max_points]
        D = np.dot(dx, np.linalg.solve(C, dx.T))
        kurtosis = np.sum(np.diag(D)**2)/n
        skewness = np.sum(D**3)/n**2

        kurtosis_stat = (kurtosis - p*(p+2)) / sqrt(8*p*(p+2)/n)
        raw_skewness_stat = n*skewness/6
        # Small sample correction converges to 1 as n increases, so it is
        # always safe to apply it
        small_sample_correction = (p+1)*(n+1)*(n+3)/((p+1)*(n+1)*n - n*6)
        skewness_stat = raw_skewness_stat * small_sample_correction
        dof = (p*(p+1)*(p+2))/6   # degrees of freedom for chisq test

        self.p_kurtosis = 2*(1 - norm.cdf(abs(kurtosis_stat)))
        self.p_skewness = 1 - chi2.cdf(skewness_stat, dof)
        self.reject_normal = self.p_kurtosis < alpha or self.p_skewness < alpha
        #print("kurtosis", kurtosis, kurtosis_stat, self.p_kurtosis)
        #print("skewness", skewness, skewness_stat, self.p_skewness)
        # compute entropy
        self.entropy = cov_entropy(C)
Example #13
def combined_p_fisher(p_values):
    """Computes the combined P-value using Fisher's combined probability
    test.
    """
    k = len(p_values)
    W = -2.0 * sum(map(math.log, p_values))
    return 1.0 - chi2.cdf(W, df=2*k)
Example #14
def _do_lrt(null_lrt, annot_lrt):
    """
        Perform the likelihood ratio test using standard LRT. 
    """
    test = -2*(null_lrt - annot_lrt)
    test_result = 1 - chi2.cdf(test, 1)
    return test_result
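Usage sketch for _do_lrt (the log-likelihood values below are hypothetical): the statistic -2*(logL_null - logL_alt) is referred to a chi-square with one degree of freedom.

p = _do_lrt(-1523.4, -1519.1)   # statistic = -2 * (-1523.4 - (-1519.1)) = 8.6
print(p)                        # ~0.003, so the richer model fits significantly better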
Example #15
def construct_tree(examples, attributes, threshold):
    if entropy(examples) == 0:
        return Node(classification=examples[0][label_index], examples=examples)
    elif len(attributes) == 0:
        counter = Counter(example[label_index] for example in examples)
        return Node(classification=max((counter[key], key) for key in counter)[1], examples=examples)
    else:
        best_attribute = get_best_attribute(examples, attributes)
        m, k = degrees_of_freedom(examples, best_attribute)
        s = s_value(m, k, examples, best_attribute)
        df = (m - 1) * (k - 1)
        p_value = 1 - chi2.cdf(s, df)
        if p_value > threshold:
            counter = Counter(example[label_index] for example in examples)
            return Node(classification=max((counter[key], key) for key in counter)[1], examples=examples)
        attributes.remove(best_attribute)
        root = Node(attribute=best_attribute, examples=examples)
        partition_map = partition_on_attribute(examples, best_attribute)
        for attribute_value in partition_map:
            partition = partition_map[attribute_value]
            if len(partition) == 0:
                counter = Counter(example[label_index] for example in examples)
                root.children[attribute_value] = Node(classification=max((counter[key], key) for key in counter)[1], examples=examples)
            else:
                root.children[attribute_value] = construct_tree(partition, attributes, threshold)
        attributes.append(best_attribute)
        return root
Example #16
def test_count_chunk(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG, idx, log=False):

    pval = sp.zeros((gene_counts.shape[0], 1), dtype='float')
    pval.fill(sp.nan)

    for i in xrange(idx.shape[0]):

        if log:
            log_progress(i, idx.shape[0])

        if sp.isnan(disp_adj[i]):
            continue

        response = gene_counts[i, :].astype('int')

        if sp.sum(response[:response.shape[0] / 2] == 0) >= CFG['max_0_frac'] * response.shape[0] / 2:
            pval[i] = 1
            continue

        modNB0 = sm.GLM(response, dmatrix0, family=sm.families.NegativeBinomial(alpha=disp_adj[i]), offset=sp.log(sf))
        modNB1 = sm.GLM(response, dmatrix1, family=sm.families.NegativeBinomial(alpha=disp_adj[i]), offset=sp.log(sf))
        result0 = modNB0.fit()
        result1 = modNB1.fit()
        pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, dmatrix1.shape[1] - dmatrix0.shape[1])

    if log:
        log_progress(idx.shape[0], idx.shape[0])
        print ''

    return (pval, idx)
Example #17
def convert_chis_to_probs(chis,dof):

	chis = chis / np.min(chis) * dof
	prob =  1.0 - chi2.cdf(chis,dof)
	prob = prob / np.max(prob)

	return prob
Example #18
def assert_chisquared(observed, expected, bin_edges=None, alpha=0.05, title=''):
    """Assert that the "observed" counts are close enough the "expected"
    counts with a chi2 test at the given confidence level. If the test fails,
    we'll save a histogram to disk.
    """
    chi2_value = np.sum((observed - expected)**2 / expected)
    
    n_dof = len(observed) - 1 # number of degrees of freedom for the chi2 test
    pval = 1 - chi2distribution.cdf(chi2_value, n_dof)
    
    print 'observed'
    print observed
    print 'expected'
    print expected
    print 'pval', pval
    
    if pval < alpha:
        if HAVE_PYPLOT:
            pp.clf()
            pp.title(title)
            pp.bar(bin_edges[0:-1], observed, width=bin_edges[1] - bin_edges[0],
                   alpha=0.5, label='observed', color='red')
            pp.bar(bin_edges[0:-1], expected, width=bin_edges[1] - bin_edges[0],
                   alpha=0.5, label='expected', color='blue')
            pp.legend()
            for i in itertools.count():
                path = 'hist-%d.png' % i
                if not os.path.exists(path):
                    break
            pp.savefig(path)
        raise ValueError('p=%f (<%f), we reject null hypothesis that the distribution '
            'matches the expected one. saved histogram as %s' % (pval, alpha, path))
Example #19
def func(data) :
	# data is a 2D python array

	dim1 = len(data)

	if(dim1 > 0) :
		dim2 = len(data[0])

	if(dim1 <= 1 or dim2 <= 1) :
		print "Invalid data"
	else :

		# calculate the rowsums, colsums, tsum
		rowsums = [0]*dim1
		colsums = [0]*dim2
		tsum = 0

		for i in range(0,dim1):
			for j in range(0,dim2) :
				rowsums[i] = rowsums[i] + data[i][j]
				colsums[j] = colsums[j] + data[i][j]

		tsum = sum(rowsums)

		print "r : " + str(rowsums)
		print "c : " + str(colsums)
		print "t : " + str(tsum)

		chiS = 0
		exp = list()

		for i in range(0,2):
			d = list()
			for j in range(0,2) :
				e = (rowsums[i]*colsums[j])/tsum
				val = ((data[i][j] - e)**2)/e
				chiS = chiS + val
				d.append(val)
			exp.append(d)

		print chiS

		degree = (dim1 - 1)*(dim2 - 1)

		print degree

		print chi2.cdf(chiS, degree)
Example #20
def computeProbabilityOfObservedOffset(x, y, p=None):
    """Compute probability that p is consistent with mean of distribution.

    For a 2 dimensional distribution of points, given by ``x`` and ``y``,
    compute the probability that the mean of the distribution is consistent
    with the input point ``p``

    Inputs:
    -------------
    x, y
        (float) Input values. Require that ``len(x) == len(y)``

    Optional Inputs:
    -----------------
    p
        (array of length 2) Point to test. Default is [0,0]

    Returns:
    -----------
    probOffset
        (float) Probability that input point is consistent with mean
        of distribution.
    chiSquare
        (float) The chi squared of the point. For highly inconsistent points
        the computed probability of offset flatlines at zero. The chisquare
        value can then be used to estimate the relative consistencies of
        different points.


    Notes:
    ---------
    See ``plotErrorEllipse`` for a description of the algorithm
    """

    if p is None:
        p = [0, 0]
    p = np.array(p)
    assert(len(p) == 2)

    assert(len(x) == len(y))
    if len(x) < 2:
        raise ValueError("Need at least two points to compute probability of offset")

    mu = np.array([np.mean(x), np.mean(y)])
    cov = np.cov(x, y) / np.sqrt(len(x))

    eigenVals, eigenVecs = np.linalg.eigh(cov)
    v1 = eigenVecs[:, 0]
    v2 = eigenVecs[:, 1]

    pDash = (p-mu)
    offset_pix = np.array([np.dot(pDash, v1), np.dot(pDash, v2)])
    sigma = np.sqrt(eigenVals)
    offset_sigma = offset_pix / sigma

    s = np.sum(offset_sigma**2)
    probOffset = chi2.cdf(s, 2)

    return probOffset, s
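A small usage sketch with synthetic data (not from the original module; assumes numpy and scipy.stats.chi2 are imported where the function lives):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(0.3, 1.0, size=200)
y = rng.normal(-0.1, 1.0, size=200)
probOffset, chiSquare = computeProbabilityOfObservedOffset(x, y, p=[0, 0])
print(probOffset, chiSquare)   # probability and chi-square for the origin relative to the sample mean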
Example #21
def corr_circular_linear(alpha, X):
    # Authors:  Jean-Remi King <*****@*****.**>
    #
    # Licence : BSD-simplified
    """

    Parameters
    ----------
        alpha : numpy.array, shape (n_angles,)
            The angular data (if n_dims == 1, repeated across all x dimensions)
        X : numpy.array, shape (n_angles, n_dims)
            The linear data
    Returns
    -------
        R : numpy.array, shape (n_dims)
            R values
        R2 : numpy.array, shape (n_dims)
            R square values
        p_val : numpy.array, shape (n_dims)
            P values

    Adapted from:
        Circular Statistics Toolbox for Matlab
        By Philipp Berens, 2009
        [email protected] - www.kyb.mpg.de/~berens/circStat.html
        Equation 27.47
    """

    from scipy.stats import chi2
    from jr.utils import pairwise
    import numpy as np

    # computes correlation for sin and cos separately
    # WIP Applies repeated correlation if X is vector
    # TODO: deals with non repeated correlations (X * ALPHA)
    if alpha.ndim > 1:
        rxs = repeated_corr(np.sin(alpha), X)
        rxc = repeated_corr(np.cos(alpha), X)
        rcs = np.zeros_like(alpha[0, :])
        rcs = pairwise(np.sin(alpha), np.cos(alpha), func=_loop_corr,
                       n_jobs=-1)
    else:
        # WIP Applies repeated correlation if alpha is vector
        rxs = repeated_corr(X, np.sin(alpha))
        rxc = repeated_corr(X, np.cos(alpha))
        rcs = repeated_corr(np.sin(alpha), np.cos(alpha))

    # Adapted from equation 27.47
    R = (rxc ** 2 + rxs ** 2 - 2 * rxc * rxs * rcs) / (1 - rcs ** 2)

    # JR adhoc way of having a sign....
    R = np.sign(rxs) * np.sign(rxc) * R
    R2 = np.sqrt(R ** 2)

    # Get degrees of freedom
    n = len(X)
    pval = 1 - chi2.cdf(n * R2, 2)

    return R, R2, pval
Example #22
def update_w(claim, index, count, truth, m, n, eps=1e-15):
    rtn = -np.ones(m)
    for i in range(n):
        rtn[index[i]] = rtn[index[i]] + (claim[i]-truth[i])**2
    rtn[rtn==0] = 1e10
    rtn[rtn>0] = chi2.cdf(0.025, count[rtn>0])/rtn[rtn>0]
    #rtn[rtn>0] = chi2.interval(0.05, count[rtn>0])[0]/rtn[rtn>0]
    return(rtn)
Example #23
def solve_chi_squared(chi_squared_value=None, f=None, p=None):
    max_1_none(chi_squared_value, f, p)
    if chi_squared_value is None:
        return chi_squared(f, p)
    elif f is None:
        raise NotImplementedError("Not implemented yet - sorry")
    elif p is None:
        return sympify(sci_chi2.cdf(float(chi_squared_value), float(f)))
Example #24
    def compute_ANOVA(self, mu=None, mu_start=0, return_weights=0):

        """
        Returns -2 log likelihood, the pvalue and the maximum likelihood
        estimate for a common mean.

        Parameters
        ----------

        mu : float
            If a mu is specified, ANOVA is conducted with mu as the
            common mean.  Otherwise, the common mean is the maximum
            empirical likelihood estimate of the common mean.
            Default is None.

        mu_start : float
            Starting value for the common mean if a specific mu is not specified.
            Default = 0

        return_weights : bool
            if TRUE, returns the weights on observations that maximize the
            likelihood.  Default is FALSE

        Returns
        -------

        res: tuple
            The log-likelihood, p-value and estimate for the common mean.
        """
        if mu is not None:
            llr = self._opt_common_mu(mu)
            pval = 1 - chi2.cdf(llr, self.num_groups - 1)
            if return_weights:
                return llr, pval, mu, self.new_weights
            else:
                return llr, pval, mu
        else:
            res = optimize.fmin_powell(self._opt_common_mu, mu_start,
                                       full_output=1)
            llr = res[1]
            mu_common = float(res[0])
            pval = 1 - chi2.cdf(llr, self.num_groups - 1)
            if return_weights:
                return llr, pval, mu_common, self.new_weights
            else:
                return llr, pval, mu_common
Example #25
def TS2sigma(TS,dof, quiet=False):
    """ one-sided Chi^2 test """
    pval_1 = chi2.cdf(TS, dof)
    sigma=math.sqrt(2)*sp.erfinv(pval_1)

    if not quiet:
        print "TS=%.2f\t->\t%.2f sigma"%(TS,sigma)
    return sigma
Example #26
def chi_square_test(sequence, k):
    frequencies = {i: 0 for i in range(k)}
    for element in sequence:
        frequencies[floor(element * k)] += 1
    expected_frequency = float(len(sequence)) / k
    chi_square = sum(
        (frequency - expected_frequency) ** 2 / expected_frequency for frequency in list(frequencies.values()))
    return chi_square, chi2.cdf(chi_square, k)
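Usage sketch for the uniformity test above (assumes floor from math and chi2 from scipy.stats are in scope, as the function requires; input values must lie in [0, 1)):

import random

seq = [random.random() for _ in range(10000)]
stat, cdf_value = chi_square_test(seq, k=10)
print(stat, 1 - cdf_value)   # statistic and its right-tail probability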
Example #27
def block_runs(bitblocks):
    bitblocks = prepare_bitblocks(bitblocks)
    try:
        B, _, K = BNKlookup(bitlength(bitblocks))
    except(IndexError):
        print("Tried (and failed) to count runs in a set that is less than 128 elements.")
    else:
        chi = chi_squared(bitblocks)
        return 1 - chi2.cdf(chi, K)
Example #28
def reduced_chi_square(xvals, yvals, sigy, func, numparam):
	""" 
	Returns the reduced chi-squared, pvalue, and DOF of the fit.
	"""
	c = 0
	n = len(xvals) - numparam
	for x, y, s in zip(xvals, yvals, sigy):
		 c += (y-func(x))**2/(s**2)
	return c/n, float(1-chi2.cdf(c/n,n)), n 
Example #29
def NOTM(bits, blocksize=110000, tmpltsz=9):
    template = maketemplate(tmpltsz)
    blocks = partition_bits(bits, blocksize)
    binpow = 1/2**tmpltsz
    theor_mean = binpow*(blocksize - tmpltsz + 1)
    theor_var = blocksize*(binpow - binpow**2*(2*tmpltsz - 1))
    chi = 0
    for block in blocks:
        chi += (find_matches(block, template) - theor_mean)**2 / theor_var
    return 1 - chi2.cdf(chi, 2)
Example #30
    def combine_exact(self):
        
        # Hypothesis Test:
        # H0(i->j): i can be an ancestor of j
        # HA(i-*>j): i cannot be an ancestor of j

        statistic = np.zeros(self.dim)

        for index in range(self.size):
            
            # H0 specifies that all sampled diff values have + means -->
            # all the observed positive values are not any different than H+ null
            # hypothesis --> they contribute a multiplicative value of 1 to the LR 
            # statistic
            # the observed negative values are where the ratio difference lies, and those
            # will be forced to be generated by mean of 0 under H+ 
 
            mask  =  (self.data[index] < 0).astype(int)
        
            # sum up the (-2ln(x) ) for x corresponding each of 
            # the terms in likelihood ratio
            # put zero where elements where the delta is missing and
            # is filled with place holder
            statistic = statistic + \
                     ((self.data[index]/ self.sigma[index])**2)* \
                     mask * self.isfilled[index]
            
        # sum over isfilled attribute to get the total number
        # of samples available for comparison of a pair of mutations
        totalSamples = sum(self.isfilled)

        # get the list of existing number of available samples across
        # all pairs of mutations
        countRange = map(int, list(np.unique(totalSamples)))
        
        # start by a blank input for pvalue
        self.pvalue = np.zeros(self.dim)

        for value in countRange:
            # for each value in existing set of available samples for a pair
            # assign the p-value for such pairs using exact test formulation
            # for that total sample count
            mask = (totalSamples == value).astype(int)
            if value == 0:
                # if no samples available for a pair of mutations
                # use pvalue of 1 to indicate lack of rejection
                # (insufficient information)
                self.pvalue += mask
                continue
            n = value
            pvalue = np.zeros(self.dim)
            for k in range(1, 1+n):
                pvalue += (1-chi2.cdf(statistic,k)) * \
                          choose(n, k, exact = True) / (2.0 ** n)
            self.pvalue += pvalue * mask
Example #31
def get_p_value(T):
  # same as scipy.stats.chi2_contingency(T, correction=False)
  det = T[0,0]*T[1,1] - T[0,1]*T[1,0]
  c2 = float(det) / T[0].sum() * det / T[1].sum() * T.sum() / T[:,0].sum() / T[:,1].sum()
  p = 1 - chi2.cdf(x=c2, df=1)
  return p
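A quick check of the claim in the comment above (a sketch with made-up counts): the shortcut formula should reproduce scipy.stats.chi2_contingency with the continuity correction disabled.

import numpy as np
from scipy.stats import chi2, chi2_contingency

T = np.array([[36., 14.],
              [30., 25.]])
print(get_p_value(T))                            # hand-rolled p-value
print(chi2_contingency(T, correction=False)[1])  # scipy's p-value, should match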
Example #32
x, y, err = np.loadtxt(fname, unpack=True)  # Read the data in #
n = len(x)  # No. of data points #

# Initial values of parameters #
p0 = [1.0, 0.01]

f = lambda x, a, b: (a / (x * x)) + b

# Read about lambda notation here : http://www.secnetix.de/olli/Python/lambda_functions.hawk #
p, covm = curve_fit(f, x, y, p0, err)  # Do the fit
a, b = p

chisq = sum(((f(x, a, b) - y) / err)**2)  # Compute chi-squared
ndf = n - len(p)  # no. of degrees of freedom
print("NDF is ", ndf)
Q = 1. - chi2.cdf(chisq,
                  ndf)  # Quality of fit parameter : Q , More the better ! #
chisq = chisq / ndf  # Compute chi-squared per DOF

covm = covm / chisq  # !
aerr, berr = np.sqrt(np.diag(covm))  # Set the error bars

# ======================================
# Print the results #
# p[0] = a ; p[1] = b
print("-----")
print("a = %10.8f +/- %7.8f with %7.4f percent error " % (a, aerr,
                                                          (aerr / a) * 100))
print("b = %10.7f  %7.7f with %7.4f percent error " % (b, berr,
                                                       (berr / b) * 100))
print("chi squared / NDF = %7.4lf" % chisq)
print("CL = %2.2f" % Q)  # Checked that it 1 - pvalue (gnuplot) ; 1 - P
Example #33
from scipy.stats import chi2
chi_square = sum([(o - e)**2. / e
                  for o, e in zip(Observed_Values, Expected_Values)])
chi_square_statistic = chi_square[0] + chi_square[1]
print("chi-square statistic:-", chi_square_statistic)

# In[38]:

#critical_value
critical_value = chi2.ppf(q=1 - alpha, df=df)
print('critical_value:', critical_value)

# In[39]:

#p-value
p_value = 1 - chi2.cdf(x=chi_square_statistic, df=df)
print('p-value:', p_value)

# In[40]:

print('Significance level: ', alpha)
print('Degree of Freedom: ', df)
print('chi-square statistic:', chi_square_statistic)
print('critical_value:', critical_value)
print('p-value:', p_value)

# In[27]:

#compare chi_square_statistic with critical_value and with the p-value, which is the probability of getting chi-square > 0.09 (the observed chi_square_statistic)
if chi_square_statistic >= critical_value:
    print("Reject H0,There is a relationship between 2 categorical variables")
Example #34
#!/usr/bin/env python
from __future__ import print_function
import pylab as pl
import scipy.optimize
from scipy.stats import chi2

for fa_rate in 1.0/pl.array([1e1, 1e2, 1e4, 1e6, 1e9]):
    print(fa_rate)
    for df in range(1,7):
        f_eq = lambda x: ((1- fa_rate) - chi2.cdf(x, df))**2
        res = scipy.optimize.minimize(f_eq, df)
        assert res['success']
        print('\t', res.x[0])
Example #35
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
}
ALPHA = 0.05

players = [player for player in Player if get_season(player) in TEST_SEASONS]
train_input = [float(get_age(player)) for player in players]
train_output = [1.0 if get_is_mol(player) else 0.0 for player in players]
non_mol = np.array(
    [data for data, label in zip(train_input, train_output) if label == 0.0])
mol = np.array(
    [data for data, label in zip(train_input, train_output) if label == 1.0])

non_mol_kde = InnerAppearanceLayer.kernel_density_estimation(non_mol)
non_mol_bandwidth = silverman_bandwidth(np.array(non_mol))
non_mol_points = len(non_mol)
mol_kde = InnerAppearanceLayer.kernel_density_estimation(mol)
mol_bandwidth = silverman_bandwidth(np.array(mol))
mol_points = len(mol)

ages = [float(age) for age in range(20, 59)]
statistic_values = [
    test_statistic(age, non_mol_kde, non_mol_bandwidth, non_mol_points,
                   mol_kde, mol_bandwidth, mol_points) for age in ages
]
p_values = [chi2.cdf(x, 1) for x in statistic_values]
p_order = sorted(p_values)
hochman = [ALPHA / (len(p_order) - j + 1) for j in range(1, len(p_order) + 1)]
indices = [
    i for i, pair in enumerate(zip(p_order, hochman)) if pair[0] <= pair[1]
]
print(indices)
Example #36
def chisq_test(O, E, degree=3, sig_level=0.05):
    measured_val = sum( [(o - e)**2/e for (o, e) in zip(O, E)] )
    return chi2.cdf(measured_val, degree), chi2.sf(measured_val, degree)
Example #37
 def _get_p_value(self) -> None:
     self.p_value = 1 - chi2.cdf(x=self.chi_square_statistic,
                                 df=self.degree_of_freedom)
Example #38
def get_pvalue(gene, data, ratio=0.5, isplot=True):
    lifespan_table = pd.DataFrame(index=['visit7', 'visit8', 'visit9', 'visit10'],
                                  columns=['obs_num1', 'prob1', 'expt_num1', 'obs_num2', 'expt_num2', 'prob2', 'obs_num'])

    gene_median = np.median(data.loc[:, gene])
    group_high = data.loc[data.loc[:, gene] >= gene_median]
    group_low = data.loc[data.loc[:, gene] < gene_median]

    # cutoff = np.quantile(data.iloc[:, 0:4], quantile)
    cutoff = np.min(np.min(data.iloc[:, 0:4])) * ratio
    n1 = 9  # sample num in each group
    n2 = 10  # sample num in each group
    n = n1 + n2
    # ---visit7---
    obs_num1 = np.size(np.where(group_high.loc[:, 'visit07_L*'] < cutoff))
    obs_num2 = np.size(np.where(group_low.loc[:, 'visit07_L*'] < cutoff))
    prob1 = obs_num1 / n1
    prob2 = obs_num2 / n2

    obs_num = obs_num1 + obs_num2
    expt_num1 = obs_num*(n1/n)
    expt_num2 = obs_num*(n2/n)
    lifespan_table.loc['visit7', :] = [obs_num1, prob1, expt_num1, obs_num2, expt_num2, prob2, obs_num]

    # ---visit8---
    obs_num1 = np.size(np.where(group_high.loc[:, 'visit08_L*'] < cutoff))
    obs_num2 = np.size(np.where(group_low.loc[:, 'visit08_L*'] < cutoff))
    prob1 = obs_num1 / n1
    prob2 = obs_num2 / n2
    obs_num = obs_num1 + obs_num2
    expt_num1 = obs_num*(n1/n)
    expt_num2 = obs_num*(n2/n)
    lifespan_table.loc['visit8', :] = [obs_num1, prob1, expt_num1, obs_num2, expt_num2, prob2, obs_num]

    # ---visit9---
    obs_num1 = np.size(np.where(group_high.loc[:, 'visit09_L*'] < cutoff))
    obs_num2 = np.size(np.where(group_low.loc[:, 'visit09_L*'] < cutoff))
    prob1 = obs_num1 / n1
    prob2 = obs_num2 / n2
    obs_num = obs_num1 + obs_num2
    expt_num1 = obs_num*(n1/n)
    expt_num2 = obs_num*(n2/n)
    lifespan_table.loc['visit9', :] = [obs_num1, prob1, expt_num1, obs_num2, expt_num2, prob2, obs_num]

    # ---visit10---
    obs_num1 = np.size(np.where(group_high.loc[:, 'visit10_L*'] < cutoff))
    obs_num2 = np.size(np.where(group_low.loc[:, 'visit10_L*'] < cutoff))
    prob1 = obs_num1 / n1
    prob2 = obs_num2 / n2
    obs_num = obs_num1 + obs_num2
    expt_num1 = obs_num*(n1/n)
    expt_num2 = obs_num*(n2/n)
    lifespan_table.loc['visit10', :] = [obs_num1, prob1, expt_num1, obs_num2, expt_num2, prob2, obs_num]

    O1 = lifespan_table.loc[:, 'obs_num1'].sum()
    O2 = lifespan_table.loc[:, 'obs_num2'].sum()
    E1 = lifespan_table.loc[:, 'expt_num1'].sum()
    E2 = lifespan_table.loc[:, 'expt_num2'].sum()

    X = np.power(O1-E1, 2)/E1 + np.power(O2-E2, 2)/E2
    p_value = 1 - chi2.cdf(X, df=1)
    print(gene, p_value)
    if p_value < 0.05 and isplot:
        prob1 = lifespan_table.loc[:, 'prob1']
        prob2 = lifespan_table.loc[:, 'prob2']
        draw_surv_plot(prob1, prob2, gene, p_value, ratio)
        draw_box_plot(data, gene, ratio)

    return p_value
Example #39
def test_count(data, opts):
    """
    Make a test for all genes iteratively.

    @args data: Store all input data and results
    @type data: Class object
    @args opts: Input argument to the main TE function 
    @type opts: Instance
    """

    print 'Start the statistical test.'

    num = len(data.geneIDs)
    pval = np.empty((num, 1))
    pval.fill(np.nan)

    explanatory0 = cm.create_matrix(data, model='H0')
    explanatory1 = cm.create_matrix(data, model='H1')
    librarySizes = np.hstack([data.libSizesRibo, data.libSizesRna])

    lenSampleRibo = data.idxRibo.size
    lenSampleRna = data.idxRna.size

    errorCnt = 0

    for i in range(num):
        sys.stdout.flush()

        if i % 50 == 0:
            print '\r%i genes finished...' % i,
        if i + 1 == num:
            print '\r%i genes finished.' % num

        if opts.dispDiff and np.isnan(data.dispAdjRibo[i]):
            continue
        if not opts.dispDiff and np.isnan(data.dispAdj[i]):
            continue

        response = np.hstack([data.countRibo[i, :], data.countRna[i, :]])

        if opts.dispDiff:
            disp = np.hstack([
                np.repeat(data.dispAdjRibo[i], lenSampleRibo),
                np.repeat(data.dispAdjRna[i], lenSampleRna)
            ])
        else:
            disp = data.dispAdj[i]

        try:
            modNB0 = sm.GLM(response,
                            explanatory0,
                            family=sm.families.NegativeBinomial(alpha=disp),
                            offset=np.log(librarySizes))
            modNB1 = sm.GLM(response,
                            explanatory1,
                            family=sm.families.NegativeBinomial(alpha=disp),
                            offset=np.log(librarySizes))
            result0 = modNB0.fit()
            result1 = modNB1.fit()
        except sm.tools.sm_exceptions.PerfectSeparationError:
            errorCnt += 1
        else:
            if not opts.dispDiff:
                pval[i] = 1 - chi2.cdf(
                    result0.deviance - result1.deviance,
                    explanatory1.shape[1] - explanatory0.shape[1])
            elif opts.dispDiff:
                pval[i] = 1 - chi2.cdf(
                    result0.deviance - result1.deviance,
                    (explanatory1.shape[1] - explanatory0.shape[1]) / 2.5)
            else:
                pass

    data.pval = pval

    sys.stdout.write(
        'Warning: Failed to do test: %i genes. P value set to \'nan\'.\n' %
        errorCnt)

    return data
Example #40
    def compare(self, contrast, alpha=0.05):
        """Compare predictions given a contrast

        If there are two predictions, you can compare as:
            roc.compare(contrast=[1, -1], alpha=0.05)
        """

        # Validate alpha
        if (alpha <= 0) | (alpha >= 1):
            raise ValueError('alpha must be in the range (0, 1), exclusive.')
        elif alpha > 0.5:
            alpha = 1 - alpha

        # Verify if covariance was calculated
        if self.S is None:
            self._calculate_covariance()

        # L as matrix
        L = np.array(contrast, dtype=float)
        if len(L.shape) == 1:
            L = L.reshape(1, L.shape[0])

        # Shapes
        L_sz = L.shape
        S_sz = self.S.shape

        # is not equal to number of classifiers
        if (S_sz[1] != L_sz[1]):  # Contrast column
            raise ValueError(
                'Contrast should have %d elements (number of predictors)' %
                S_sz[1])

        # Validate contrast
        if np.any(np.sum(L, axis=1) != 0):
            raise ValueError('Contrast rows must sum to 0', S_sz[1])

        # Calculate LSL matrix
        LSL = L @ self.S @ np.transpose(L)

        # Normal vs chi^2 distribution
        if L_sz[0] == 1:
            # Compute using the normal distribution
            mu = L @ np.transpose(self.auc)
            sigma = np.sqrt(LSL)
            thetaP = norm.cdf(0, mu, sigma)

            # 2-sided test, double the tails -> double the p-value
            if mu < 0:
                thetaP = 2 * (1 - thetaP)
            else:
                thetaP = 2 * thetaP

            # Confidence intervals
            theta2 = norm.ppf([alpha / 2, 1 - alpha / 2], mu, sigma)
        else:
            # Calculate chi2 stat with DOF = rank(L*S*L')
            # first invert the LSL matrix
            inv_LSL = np.linalg.inv(LSL)

            # then calculate the chi2
            w_chi2 = self.auc @ np.transpose(L) @ inv_LSL @ L @ np.transpose(
                self.auc)
            w_df = np.linalg.matrix_rank(np.transpose(LSL))
            thetaP = 1 - chi2.cdf(w_chi2, w_df)
            theta2 = chi2.ppf([alpha / 2, 1 - alpha / 2], w_df)

        return np.ndarray.item(thetaP), theta2
Example #41
5. P-VALUE: PASS THE TEST OR NOT
The probability of finding results this extreme
 (values to the right, positive)
Conditional probability:
    finding extreme values given that the distribution is normal
WRONG: the probability that it is normal given the value obtained

'''
x_mean = np.mean(x)
x_std = np.std(x)
x_skew = skew(x)
x_kurtosis = kurtosis(x)  # excess kurtosis, k - 3
x_jb_stat = nb_sim / 6 * (x_skew**2 + 1 / 4 * x_kurtosis**2)
# how far the sample is from normality
# necessarily small
p_value = 1 - chi2.cdf(x_jb_stat, df=2)
# distributed as chi2 with 2 degrees of freedom
# if p-value < significance level => reject H0
# if p-value > significance level => do not reject H0
x_is_normal = (p_value > 0.05)  # equivalent to jb < 6

print('skewness is ' + str(x_skew))
print('kurtosis is ' + str(x_kurtosis))
print('Jarque-Bera statistic is ' + str(x_jb_stat))
print('p-value is ' + str(p_value))
print('is normal ' + str(x_is_normal))

#jb_list = []
#jb_list.appennd(x_jb_stat)

#Plot histogram
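A cross-check sketch (assuming x above is the simulated 1-D sample and nb_sim equals its length): scipy ships the same Jarque-Bera test.

from scipy.stats import jarque_bera

jb_stat, jb_pvalue = jarque_bera(x)
print(jb_stat, jb_pvalue)   # should be close to x_jb_stat and p_value computed above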
Example #42
def result_cross(F, Y, T, N, Q):
    Lambda, SD, CV, p_value = test_statistic_cross(F, Y, T, N, Q)
    Lambda_1 = (N - Q) * Lambda
    p_value_1 = 1 - chi2.cdf(Lambda_1, N - Q)
    return (Lambda, p_value, Lambda_1, p_value_1)
Example #43
#%%
import numpy as np
from scipy.stats import chi2
from scipy.stats import chisquare
import matplotlib.pyplot as plt

df = 3
rv = chi2(df)

vals = chi2.ppf([0.001, 0.5, 0.999], df)
np.allclose([0.001, 0.5, 0.999], chi2.cdf(vals, df))

stat = [101, 189, 317, 393]
e = [100, 200, 300, 400]

k2 = 0
for i in range(len(e)):
    k2 += (stat[i] - e[i])**2 / e[i]
    
print(k2)
print(chisquare(stat, f_exp=e))

p = 1 - chi2.cdf(k2, df)
print(p)

x = np.linspace(0, 10, 1000)
alpha = 0.05
k2 = chisquare(stat, f_exp=e)[:1][0]
y = chi2.sf(x, df)
plt.plot(x, y)
plt.hlines(alpha, 0, 10, colors='r')
Example #44
from scipy import stats
from scipy.stats import t as t_dist
from scipy.stats import chi2

from abtesting_test import *
import math

# You can comment out these lines! They are just here to help follow along to the tutorial.
print(t_dist.cdf(-2, 20))  # should print .02963
print(t_dist.cdf(
    2, 20))  # positive t-score (bad), should print .97036 (= 1 - .2963)

print(chi2.cdf(23.6, 12))  # prints 0.976
print(1 - chi2.cdf(23.6, 12))  # prints 1 - 0.976 = 0.023 (yay!)

# TODO: Fill in the following functions! Be sure to delete "pass" when you want to use/run a function!
# NOTE: You should not be using any outside libraries or functions other than the simple operators (+, **, etc)
# and the specifically mentioned functions (i.e. round, cdf functions...)


def slice_2D(list_2D, start_row, end_row, start_col, end_col):
    '''
    Slices the 2D list via start_row:end_row and start_col:end_col
    :param list: list of list of numbers
    :param nums: start_row, end_row, start_col, end_col
    :return: the sliced 2D list (ending indices are exclusive)
    '''
    to_append = []
    for l in range(start_row, end_row):
        to_append.append(list_2D[l][start_col:end_col])
Example #45
    def get_filtered_VariantsLogRatio(self):
        iter = 0

        #get most abundant base
        self.maxA = np.argmax(self.freq, axis=1)
        #get second most abundant base

        ftemp = np.copy(self.freq)
        for v in range(self.V):
            ftemp[v, self.maxA[v]] = -1
        self.maxB = np.argmax(ftemp, axis=1)

        N = (self.freq.sum(axis=1)).astype(np.float64)
        n = (self.freq.max(axis=1)).astype(
            np.float64)  #value of most abundant base
        m = (ftemp.max(axis=1)).astype(
            np.float64)  #value of second most abundant
        e = N - n
        self.filtered = N < self.Nthreshold
        self.minV = np.zeros(self.V)
        self.minV[self.filtered == False] = m[self.filtered == False] / N[
            self.filtered == False]

        p = np.zeros(self.V)
        MLL = np.zeros(self.V)
        lastSelect = 0
        Select = self.V
        p[self.filtered ==
          False] = n[self.filtered == False] / N[self.filtered == False]
        p[p > self.upperP] = self.upperP
        while iter < self.max_iter and lastSelect != Select:
            #filter based on current error rate

            BLL = -(log(self.eta[self.maxA, :]) * self.freq).sum(axis=1)

            for v in range(self.V):
                if self.optimise:
                    res = minimize_scalar(mixNLL,
                                          bounds=(0.0, self.upperP),
                                          args=(self.eta, self.maxA[v],
                                                self.maxB[v],
                                                self.ffreq[v, :]),
                                          method='bounded')
                    p[v] = res.x
                MLL[v] = mixNLL(p[v], self.eta, self.maxA[v], self.maxB[v],
                                self.ffreq[v, :])

            ratioNLL = 2.0 * (BLL - MLL)

            self.filtered = np.logical_or(N < self.Nthreshold,
                                          ratioNLL < self.threshold)

            ff = self.freq[self.filtered]
            af = self.maxA[self.filtered]
            Nf = N[self.filtered]

            self.eta = 96 * np.identity((4)) + np.ones((4, 4))

            for v in range(Nf.shape[0]):
                self.eta[af[v], :] += ff[v, :]

            esums = self.eta.sum(axis=1)
            self.eta = self.eta / esums[:, np.newaxis]

            lastSelect = Select
            Select = self.V - self.filtered.sum()
            logging.info("Variant filter iter: " + str(iter) + " " +
                         str(Select) + " " + str(self.eta))
            sys.stdout.flush()
            iter = iter + 1

        self.pvalue = 1.0 - chi2.cdf(ratioNLL, 1)
        self.qvalue = benjamini_Hochberg(self.pvalue)
        self.ratioNLL = ratioNLL
        self.filtered = np.logical_or(N < self.Nthreshold,
                                      self.qvalue > self.qvalue_cutoff)

        self.snps_filter = self.snps_filter[self.filtered != True, :, :]
        self.selected_indices = list(np.where(self.filtered != True))
        self.selected_indices = self.selected_indices[0].tolist()
        self.selected = self.filtered != True
        self.NS = self.snps_filter.shape[0]
        return self.snps_filter
Example #46
def chi_sq(t, x, err, f, n, df):
    chi_sq = sum((x - f(t, n))**2/err**2)
    p_value = 1 - chi2.cdf(chi_sq, df)
    return chi_sq, p_value
Example #47
                   mRNA,
                   samples=25,
                   plot=False)

    # calculate v
    v0 = sum(res0**2)
    vA = sum(resA**2)
    vC = sum(resC**2)
    vD = sum(resD**2)
    # calculate dw
    dw0 = durbin_watson(res0)
    dwA = durbin_watson(resA)
    dwC = durbin_watson(resC)
    dwD = durbin_watson(resD)
    # p-values based on explicit testing
    chi2p0 = 1 - chi2.cdf(v0, n - 1)
    chi2pA = 1 - chi2.cdf(vA, n - 2)
    chi2pC = 1 - chi2.cdf(vC, n - 3)
    chi2pD = 1 - chi2.cdf(vD, n - 4)
    # explicit testing
    # rejection because of chi2 test
    r0_ex = chi2p0 < 0.05
    rA_ex = chi2pA < 0.05
    rC_ex = chi2pC < 0.05
    rD_ex = chi2pD < 0.05
    # get lower limit for dw-test (explicit)
    dL0 = float(dw[(dw.n == n) & (dw.m == 2)].dL)
    dLA = float(dw[(dw.n == n) & (dw.m == 2)].dL)
    dLC = float(dw[(dw.n == n) & (dw.m == 3)].dL)
    if n == 7:
        dLD = float(dw[(dw.n == n) & (dw.m == 3)].dL)
Example #48
y_data = log_MF[ok]
y_data_err = (data["std90_pc_cen"][ok]**2. + data["dN_counts_cen"][ok]**(-1.))**(0.5)

#p.hist(n.log10(y_data_err), bins=10)
#p.show()

ps = n.array([0.333, 0.794, 0.247])
log_fsigma = lambda logsigma, A, a, p : n.log10(lib.f_BH(10**-logsigma, A, a, p, 1.))
print "ST01 fit ----------------------"
pOpt, pCov=curve_fit(log_fsigma, x_data, y_data, ps, sigma = y_data_err, maxfev=50000000)#, bounds=boundaries)
chi2 = n.sum(((log_fsigma(x_data, pOpt[0], pOpt[1], pOpt[2])-y_data)/y_data_err)**2. ) 
ndof = (len(x_data) - len(ps)) 
print "best params=", pOpt
print "err=", pCov.diagonal()**0.5
print "chi2 ", chi2, ndof, chi2/ndof
print "P chi2 1-cdf", 1-stc2.cdf(int(chi2),ndof)
print "---------------------------------------------------"
pOpt_ST01 = pOpt
pErr_ST01 = pCov.diagonal()**0.5

n.savetxt(join(os.environ['MVIR_DIR'],"mvirFunction_parameters_ST01_MFonly_fit.txt"), n.transpose([pOpt_ST01, pErr_ST01]), header="A a p")

print "BATT 2011"
ps = n.array([A0, a0, p0, q0])
log_fsigma = lambda logsigma, A, a, p, q : n.log10(lib.f_BH(10**-logsigma, A, a, p, q))

pOpt, pCov=curve_fit(log_fsigma, x_data, y_data, ps, sigma = y_data_err, maxfev=50000000)#, bounds=boundaries)
chi2 = n.sum(((log_fsigma(x_data, pOpt[0], pOpt[1], pOpt[2], pOpt[3])-y_data)/y_data_err)**2. ) 
ndof = (len(x_data) - len(ps)) 
print "best params=", pOpt
print "err=", pCov.diagonal()**0.5
Example #49
def gal_smass(id, i, ierr, gr, ri, iz, gre, rie, ize, zed, splines, zmet,
              galaxy):

    print galaxy, "  at z = ", zed
    cd = CosmologicalDistance.CosmologicalDistance()
    ldistDict = dict()
    rest_gr, rest_gi, weight = [], [], []
    masslight, sfrs, ages, zmets, kii, kri = [], [], [], [], [], []
    minChiSq = 999
    spIndex = -1
    for sp in range(0, len(splines)):
        # for speed
        skey = str(sp) + "-" + str(zed)
        sgr = splines[sp][0](zed)
        sri = splines[sp][1](zed)
        siz = splines[sp][2](zed)
        sgrr = splines[sp][4](zed)
        # restframe g-r
        sgir = splines[sp][5](zed)
        # restframe g-i
        skii = splines[sp][6](zed)
        # kcorrection: i_o - i_obs
        skri = splines[sp][7](zed)
        # kcorrection: r_o - i_obs
        sml = splines[sp][8](zed)
        # log(mass/light)  (M_sun/L_sun)
        ssfr = splines[sp][9](zed)
        sage_cosmic = splines[sp][10](zed)
        sage = splines[sp][11](zed)
        szmet = zmet[sp]
        #To be changed if SFH changes

        gr_chisq = pow((gr - sgr) / gre, 2)
        ri_chisq = pow((ri - sri) / rie, 2)
        iz_chisq = pow((iz - siz) / ize, 2)
        rest_gr.append(sgrr)
        rest_gi.append(sgir)
        kii.append(skii)
        kri.append(skri)
        masslight.append(sml)
        sfrs.append(ssfr)
        ages.append(sage)
        zmets.append(szmet)
        chisq = gr_chisq + ri_chisq + iz_chisq
        probability = 1 - chi2.cdf(chisq, 3 - 1)
        # probability of chisq greater than this
        weight.append(probability)
    spIndex = np.argmax(weight)
    rest_gr = np.array(rest_gr)
    rest_gi = np.array(rest_gi)
    kii = np.array(kii)
    kri = np.array(kri)
    masslight = np.array(masslight)
    sfrs = np.array(sfrs)
    ages = np.array(ages)
    weight = np.array(weight)
    gr_weighted = rest_gr * weight
    gi_weighted = rest_gi * weight
    kii_weighted = kii * weight
    kri_weighted = kri * weight
    masslight_weighted = masslight * weight
    sfr_weighted = sfrs * weight
    age_weighted = ages * weight
    zmet_weighted = zmets * weight
    w1 = weight.sum()
    w2 = (weight**2).sum()
    if w1 == 0: w1 = 1e-10
    if w2 == 0: w2 = 1e-10
    mean_gr = gr_weighted.sum() / w1
    mean_gi = gi_weighted.sum() / w1
    mean_kii = kii_weighted.sum() / w1
    mean_kri = kri_weighted.sum() / w1
    mean_masslight = masslight_weighted.sum() / w1
    mean_sfr = sfr_weighted.sum() / w1
    mean_age = age_weighted.sum() / w1
    mean_zmet = zmet_weighted.sum() / w1
    # unbiased weighted estimator of the sample variance
    w3 = w1**2 - w2
    if w3 == 0: w3 = 1e-10
    var_gr = (w1 / w3) * (weight * (rest_gr - mean_gr)**2).sum()
    var_gi = (w1 / w3) * (weight * (rest_gi - mean_gi)**2).sum()
    var_kii = (w1 / w3) * (weight * (kii - mean_kii)**2).sum()
    var_kri = (w1 / w3) * (weight * (kri - mean_kri)**2).sum()
    var_masslight = (w1 / w3) * (weight *
                                 (masslight - mean_masslight)**2).sum()
    var_sfr = (w1 / w3) * (weight * (sfrs - mean_sfr)**2).sum()
    var_age = (w1 / w3) * (weight * (ages - mean_age)**2).sum()
    var_zmet = (w1 / w3) * (weight * (zmets - mean_zmet)**2).sum()
    std_gr = var_gr**0.5
    std_gi = var_gi**0.5
    std_kii = var_kii**0.5
    std_kri = var_kri**0.5
    std_masslight = var_masslight**0.5
    std_sfr = var_sfr**0.5
    std_age = var_age**0.5
    std_zmet = var_zmet**0.5
    if std_gr > 99.99: std_gr = 99.99
    if std_gi > 99.99: std_gi = 99.99
    if std_kii > 99.99: std_kii = 99.99
    if std_kri > 99.99: std_kri = 99.99
    if std_sfr > 99.99: std_sfr = 99.99
    if std_age > 99.99: std_age = 99.99
    if std_masslight > 99.99: std_masslight = 99.99
    if std_zmet > 99.99: std_zmet = 99.99
    # Comment -distanceModulus out for fsps versions <2.5, as their mags don't include distance modulus
    if zed in ldistDict:
        lumdist = ldistDict[zed]
    else:
        lumdist = cd.luminosity_distance(zed)
        # in Mpc
        ldistDict[zed] = lumdist
    distanceModulus = 5 * np.log10(lumdist * 1e6 / 10.)
    iabs = i + mean_kii - distanceModulus
    rabs = i + mean_kri - distanceModulus
    taMass = taylorMass(mean_gi, iabs)
    mcMass = mcintoshMass(mean_gr, rabs)
    fsMass = fspsMass(mean_masslight, iabs)
    # JTA: to make purely distance modulus
    #iabs = i[galaxy] - distanceModulus
    #fsMass = gstarMass( iabs )

    # saving for output
    # perhaps: out_id[galaxy] = id[galaxy]

    return [id, zed, mean_gr , std_gr, mean_gi, std_gi, mean_kii, std_kii, mean_kri, std_kri , i, distanceModulus , iabs,\
    rabs, mcMass, taMass, fsMass, std_masslight,spIndex,zmets[spIndex],mean_sfr,std_sfr, mean_age, std_age, mean_zmet, std_zmet ]
Example #50
def Doornik_Hansen_MVN_test(X, verbose=1):
    print("==========================================")
    print("Doornik-Hansen Multivariate Normality Test")
    print("------------------------------------------")
    n = X.shape[0]
    k = X.shape[1]
    print("the number of dimesions    :", k)
    print("the number of sample points:", n)
    print("------------------------------------------")
    # === x_avg =======================================
    x_avg = (1 / n) * np.sum(X, axis=0)
    x_avg = x_avg.reshape((k, 1))
    # === inverse of sample_covariance_matrix =========
    S = np.cov(X.T)
    # for numpy function np.cov(M),
    # each row of M represents a variable (dimension)
    # each column of M represent a single observation
    # thus, transpose required
    S_inv = np.linalg.inv(S)  # inverse of S
    # === V and C = VSV (correlation matrix) ==========
    D = np.sqrt(np.diag(S))
    D = np.diagflat(D)
    V = np.linalg.inv(D)
    C = np.dot(np.dot(V, S), V)  # correlation matrix
    # === eigenvectors and eignevalues of C
    eig_val, eig_vec = np.linalg.eig(C)
    L = np.diagflat(eig_val**-0.5)
    H = eig_vec.T  # columns are eigenvectors
    # === transformed X
    X_center = X - x_avg.T
    dot_prod = np.dot(X_center, V)
    dot_prod = np.dot(dot_prod, H)
    dot_prod = np.dot(dot_prod, L)
    X_transf = np.dot(dot_prod, H.T)
    # === skewness and kurtosis of transformed X
    skewn_list = []
    kurto_list = []
    for dimension in range(k):
        x = X_transf[:, dimension]
        skewn = skew(x)
        kurto = kurtosis(x, fisher=False)  # Pearson definition
        skewn_list.append(skewn)
        kurto_list.append(kurto)
    # === z1 and z2 ==================================
    z1_list = []
    z2_list = []
    for p in range(k):
        b1 = (skewn_list[p])**2
        b2 = kurto_list[p]
        beta = (3 * (n**2 + 27 * n - 70) * (n + 1) *
                (n + 3)) / ((n - 2) * (n + 5) * (n + 7) * (n + 9))
        ohm_2 = -1 + (2 * (beta - 1))**0.5
        delta1 = (np.log10(ohm_2**0.5))**(-0.5)
        y = ((b1 * (ohm_2 - 1) * (n + 1) * (n + 3)) / (12 * (n - 2)))**(0.5)
        z1 = delta1 * (np.log10(y + (1 + y**2)**0.5))
        z1_list.append(z1)
        # -------------------------------------------------------------
        delta2 = (n - 3) * (n + 1) * (n**2 + 15 * n - 4)
        a = ((n - 2) * (n + 5) * (n + 7) * (n**2 + 27 * n - 70)) / (6 * delta2)
        c = ((n - 7) * (n + 5) * (n + 7) * (n**2 + 2 * n - 5)) / (6 * delta2)
        f = ((n + 5) * (n + 7) * (n**3 + 37 *
                                  (n**2) + 11 * n - 313)) / (12 * delta2)
        alpha = a + b1 * c
        chi = 2 * f * (b2 - 1 - b1)
        z2 = ((9 * alpha)**0.5) * ((chi / (2 * alpha))**(1 / 3) - 1 +
                                   (1 / (9 * alpha)))
        z2_list.append(z2)
    Z1 = np.array(z1_list).reshape(k, 1)  # vertical vector
    Z2 = np.array(z2_list).reshape(k, 1)  # vertical vector
    # === chi2 prob ==================================
    statistic = (np.dot(Z1.T, Z2) + np.dot(Z2.T, Z2))  #[0][0]
    chi2_df = 2 * k
    p_chi2 = 1 - chi2.cdf(statistic, chi2_df)
    print("statistic: %.4f" % statistic)
    print("p-val    : %.4f" % p_chi2)
    print("------------------------------------------")
    print("If p-val < alpha (0.05), reject H0.")
    print("Note that H0 is multi-normality,")
    print("larger p-val indicate multi-normality")
    print("==========================================")
    return None
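Usage sketch with synthetic data (assumes numpy, scipy.stats.skew/kurtosis and chi2 are imported in the function's module): for a sample drawn from a 3-dimensional Gaussian, the test should report a large p-value and not reject multivariate normality.

import numpy as np

rng = np.random.default_rng(1)
X = rng.multivariate_normal(mean=np.zeros(3), cov=np.eye(3), size=500)
Doornik_Hansen_MVN_test(X)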
Example #51
def run_site_branch(cluster_name, treefile, alignment, folder_temp, folder_plots):
    from ete2 import EvolTree
    from ete2.treeview.layouts import evol_clean_layout
    import os
    from collections import defaultdict
    import math
    from scipy.stats import chi2

    print "Processing cluster: " + cluster_name

    tree = EvolTree(treefile)
    tree.link_to_alignment(alignment, alg_format="fasta", nucleotides=True)

    #Create temporal folder
    temp_cluster_folder = folder_temp + "/" + cluster_name

    if not os.path.exists(temp_cluster_folder):
        os.makedirs(temp_cluster_folder)

    tree.workdir = temp_cluster_folder

    #Run M0 as the null model
    tree.run_model("M0")

    #Look at the site selection on each branch

    printed_tree = 0

    i = 0

    #Output list with the results
    output_list = []

    for node in tree.iter_descendants():

        #Mark the tree for the leaf under analysis
        tree.mark_tree([node.node_id], marks=["#1"])

        #Use the node id as folder name
        temp_leaf_name = str(node.node_id)

        print "Processing: " + cluster_name + " " + temp_leaf_name + " " + ",".join(node.get_leaf_names())

        #Run computation of each model.
        #From the notes on ETE:
        # to organize a bit, we name model with the name of the marked node
        # any character after the dot, in model name, is not taken into account
        # for computation. (have a look in /tmp/ete2.../bsA.. directory)

        tree.run_model("bsA." + temp_leaf_name)
        tree.run_model("bsA1." + temp_leaf_name)

        bsA = tree.get_evol_model("bsA." + temp_leaf_name)
        bsA1 = tree.get_evol_model("bsA1." + temp_leaf_name)

        ps_sites = defaultdict()
        total_sites = 0
        sites_over_95 = 0

        for s in range(len(bsA.sites['BEB']['aa'])):
            p_value_site = float(bsA.sites['BEB']['p2'][s])

            if p_value_site > 0.50:
                ps_sites[s] = [bsA.sites['BEB']['aa'][s], bsA.sites['BEB']['p2'][s]]
                total_sites += 1

                if p_value_site > 0.95:
                    sites_over_95 += 1

        #ps = float(tree.get_most_likely("bsA." + temp_leaf_name, "bsA1." + temp_leaf_name))
        rx = float(tree.get_most_likely("bsA1." + temp_leaf_name, "M0"))

        lrt_value = 2 * math.fabs(bsA1.lnL - bsA.lnL)  # LRT test value
        ps = 1 - chi2.cdf(lrt_value, 1)  # p-value based on chi-square

        test_status = None

        #Evidence of positive selection in the branch
        omega_value = float(bsA.classes['foreground w'][2])
        proportion_sites = float(bsA.classes['proportions'][2])

        #Plot file
        plot_file = folder_plots + "/" + cluster_name

        if ps < 0.05 and omega_value > 1:
            #Save plots, both in jpg and svg of the clusters with evidence of positive selection
            test_status = "Positive"

            if printed_tree == 0:
                #tree.render(plot_file + ".svg", layout=evol_clean_layout)
                #tree.render(plot_file + ".jpg", layout=evol_clean_layout)
                printed_tree = 1

            else:
                continue

        elif rx < 0.05 and ps >= 0.05:
            test_status = "Relaxed"

        else:
            #print "no signal"
            test_status = None

        #Remove marks on the tree
        tree.mark_tree(map(lambda x: x.node_id, tree.get_descendants()), marks=[''] * len(tree.get_descendants()),
                       verbose=False)

        result_entry = [cluster_name, node.node_id, omega_value, proportion_sites, ps, test_status,
                        total_sites, sites_over_95, ",".join(node.get_leaf_names())]

       # print result_entry
        #print ps_sites
        #node_results[node.node_id] = [result_entry, ps_sites]
        node_result = [result_entry, ps_sites]

        output_list.append(node_result)

    return output_list
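The p-value above is an ordinary likelihood-ratio test between the nested bsA1 (null) and bsA (alternative) models. As a minimal sketch with made-up log-likelihoods in place of the PAML output:

import math
from scipy.stats import chi2

lnL_null = -1520.7   # hypothetical log-likelihood of the null model (bsA1)
lnL_alt = -1516.2    # hypothetical log-likelihood of the alternative model (bsA)

lrt_value = 2 * math.fabs(lnL_alt - lnL_null)  # LRT statistic
p_value = 1 - chi2.cdf(lrt_value, 1)           # one extra free parameter -> 1 degree of freedom
print("LRT = %.3f, p = %.4f" % (lrt_value, p_value))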
Exemple #52
0
def makePrunedSubtrees(remainingAttributes, examples, attributeValues,
                       className, defaultLabel, setScoreFunc, gainFunc, q):
    """
    Creates a classification tree Node and all its children. This returns a Node, which is the root
    Node of the tree constructed from the passed in parameters. This should be implemented recursively,
    and handle base cases for zero examples or remainingAttributes as covered in the book.

    Args:
        remainingAttributes (list<string>): the names of attributes still not used
        examples (list<dictionary<str,str>>): list of examples
        attributeValues (dictionary<string,list<string>>): possible values for each attribute
        className (str): the name of the class
        defaultLabel (string): the default label
        setScoreFunc (func): the function to score classes (ie classEntropy or gini)
        gainFunc (func): the function to score gain of attributes (ie entropyGain or giniGain)
        q (float): the Chi-Squared pruning parameter
    Returns:
        Node or LeafNode
        The classification tree node optimal for the remaining set of attributes.
    """
    # YOUR CODE HERE (Extra Credit)
    if len(examples) == 0:
        return LeafNode(defaultLabel)
    else:
        matching = True
        currentClassification = examples[0][className]
        for dictionary in examples:
            if dictionary[className] != currentClassification:
                matching = False
        if matching:
            return LeafNode(currentClassification)
    if len(remainingAttributes) == 0:
        classCounts = getClassCounts(examples, className)
        currentMax = classCounts[classCounts.keys()[0]]
        currentFeature = classCounts.keys()[0]
        for key in classCounts.keys():
            if classCounts[key] > currentMax:
                currentMax = classCounts[key]
                currentFeature = key
        return LeafNode(currentFeature)
    else:
        maxScore = float(
            gainFunc(examples, remainingAttributes[0],
                     attributeValues[remainingAttributes[0]], className))
        bestAttribute = remainingAttributes[0]
        for attribute in remainingAttributes:
            currentScore = float(
                (gainFunc(examples, attribute, attributeValues[attribute],
                          className)))
            if abs(currentScore) > abs(maxScore):
                bestAttribute = attribute
                maxScore = currentScore
        node = Node(bestAttribute)

        # a list of all attributesValues we're splitting on
        bestAttributeValues = attributeValues[bestAttribute]

        # make list of class values
        classificationValues = []
        for dictionary in examples:
            for key in dictionary.keys():
                if key == className and not (dictionary[key]
                                             in classificationValues):
                    classificationValues.append(dictionary[key])
        classCountsForExamples = getClassCounts(examples, className)
        # chi statistic
        sum = 0.0
        # go through splitValues
        for splitValue in bestAttributeValues:
            # split on values
            subset = getPertinentExamples(examples, bestAttribute, splitValue)
            classCountsForSubset = getClassCounts(subset, className)

            # go through all class values
            for classValue in classificationValues:

                if classValue in classCountsForExamples.keys():
                    numberOfValueInExamples = classCountsForExamples[
                        classValue]
                    # print "number in example ", numberOfValueInExamples
                    probabilityOfValueInExamples = float(
                        numberOfValueInExamples) / float(len(examples))

                else:
                    probabilityOfValueInExamples = 0.0

                if classValue in classCountsForSubset.keys():
                    numberOfValueInSplit = classCountsForSubset[classValue]
                    # print "number in split ", numberOfValueInSplit
                    probabilityOfValueInSplit = float(numberOfValueInSplit)  # observed count in the split (not a probability, despite the name)
                else:
                    probabilityOfValueInSplit = 0.0

                p_hat = probabilityOfValueInExamples * float(len(subset))  # expected count of this class in the split

                if p_hat != 0.0:
                    currentSum = float(
                        ((abs(p_hat - probabilityOfValueInSplit))**2) / p_hat)
                    sum += currentSum

        # degrees of freedom v
        degreesOfFreedom = len(bestAttributeValues) - 1  # optionally * (len(classificationValues) - 1)
        p_value_chi = 1 - chi2.cdf(sum, degreesOfFreedom)
        # print p_value_chi
        ##print sum

        # p_value_chi = chisqprob(sum, degreesOfFreedom)
        # print p_value_chi
        # print q

        if p_value_chi > q:  # (0.0028893):
            # print "pruning"
            classCounts = getClassCounts(examples, className)
            # print classCounts
            currentMax = classCounts[classCounts.keys()[0]]
            currentFeature = classCounts.keys()[0]
            for key in classCounts.keys():
                if classCounts[key] > currentMax:
                    currentMax = classCounts[key]
                    currentFeature = key
            return LeafNode(currentFeature)
        else:

            newRemainingAttributes = []

            # remove bestAttribute from list
            for attribute in remainingAttributes:
                if attribute != bestAttribute:
                    newRemainingAttributes.append(attribute)

            for valueOfBest in attributeValues[bestAttribute]:
                newExamples = getPertinentExamples(examples, bestAttribute,
                                                   valueOfBest)

                classCounts = getClassCounts(examples, className)
                currentMax = classCounts[classCounts.keys()[0]]
                currentFeature = classCounts.keys()[0]
                for key in classCounts.keys():
                    if classCounts[key] > currentMax:
                        currentMax = classCounts[key]
                        currentFeature = key
                node.children[valueOfBest] = makePrunedSubtrees(
                    newRemainingAttributes, newExamples, attributeValues,
                    className, currentFeature, setScoreFunc, gainFunc, q)
            return node
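A small self-contained sketch of the pruning criterion above, with a hypothetical split of 20 examples into two branches: compare each branch's observed class counts against the counts expected under the parent's class distribution, and prune when the chi-square p-value exceeds the pruning parameter q.

from scipy.stats import chi2

parent_counts = {"yes": 12, "no": 8}                        # hypothetical parent node
branch_counts = [{"yes": 8, "no": 3}, {"yes": 4, "no": 5}]  # hypothetical split into two branches
n_total = float(sum(parent_counts.values()))

stat = 0.0
for branch in branch_counts:
    branch_size = sum(branch.values())
    for label, parent_count in parent_counts.items():
        expected = parent_count / n_total * branch_size     # expected count in this branch
        observed = branch.get(label, 0)
        if expected > 0:
            stat += (observed - expected) ** 2 / expected

dof = len(branch_counts) - 1   # same degrees-of-freedom convention as above
q = 0.05                       # hypothetical pruning threshold
p_value = 1 - chi2.cdf(stat, dof)
print("p = %.4f -> %s" % (p_value, "prune" if p_value > q else "keep split"))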
Exemple #53
0
    def _mcnemar_p_value(self):
        p = 1 - chi2.cdf(self.mcnemar_x2_statistic, 1)

        return p
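The McNemar statistic itself is not shown in this snippet; a minimal sketch of the usual uncorrected version on a hypothetical pair of discordant counts:

from scipy.stats import chi2

b, c = 15, 7                              # hypothetical discordant-pair counts
mcnemar_x2 = (b - c) ** 2 / float(b + c)  # uncorrected McNemar chi-square statistic
p = 1 - chi2.cdf(mcnemar_x2, 1)           # 1 degree of freedom, as above
print(mcnemar_x2, p)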
def CDF_trial(N, x):
    """Calculated theoretical expectation for CDF"""
    return np.array(np.power(chi2.cdf(np.array(x), 4), N))
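A quick Monte Carlo sanity check of this expression, comparing the theoretical CDF of the maximum of N chi-square(4) draws against simulation:

import numpy as np
from scipy.stats import chi2

N, x = 50, 20.0
maxima = chi2.rvs(4, size=(10000, N), random_state=0).max(axis=1)
print(np.mean(maxima <= x))     # Monte Carlo estimate
print(chi2.cdf(x, 4) ** N)      # theoretical value, as in CDF_trial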
# (https://markusthill.github.io/mahalanbis-chi-squared/#the-squared-mahalanobis-distance-follows-a-chi-square-distribution-more-formal-derivation)
#
# Given a cutoff value associated with the statistical significance
# with which we want to determine outliers, we obtain the corresponding
# threshold value above which to consider an observation an outlier
cutoff = 0.98
degrees_of_freedom = df_transf.shape[1]  # given by the number of variables (columns)
cut = chi2.ppf(cutoff, degrees_of_freedom) # threshold value

# Squared Mahalanobis distance values of outliers
D[D > cut]

# %%
# Chi-square probability mass below the squared distance D[5],
# i.e. how extreme observation 5 is
chi2.cdf(D[5], degrees_of_freedom)

# %%
# Determine which observations are outliers given the cutoff
is_outlier_arr = (D > cut)

# For each observation, the chi-square probability mass below its squared distance
outliers_stat_proba = np.zeros(len(is_outlier_arr))

for i in range(len(is_outlier_arr)):
    outliers_stat_proba[i] = chi2.cdf(D[i], degrees_of_freedom)

# How many outliers with statistical significance greater than the cutoff
len(outliers_stat_proba[outliers_stat_proba > cutoff])
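A minimal end-to-end sketch of the same procedure on synthetic data (D and df_transf above are assumed to come from earlier in the script): compute squared Mahalanobis distances and flag observations beyond the chi-square threshold.

import numpy as np
from scipy.stats import chi2

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 3))    # 200 observations of 3 variables
X[:5] += 6                       # plant a few obvious outliers

mu = X.mean(axis=0)
S_inv = np.linalg.inv(np.cov(X, rowvar=False))
diff = X - mu
D_sq = np.einsum('ij,jk,ik->i', diff, S_inv, diff)  # squared Mahalanobis distances

threshold = chi2.ppf(0.98, X.shape[1])
print(np.sum(D_sq > threshold))  # number of flagged outliers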

Exemple #57
0
 def _chi2_test(self):
     degrees_of_freedom = (self.data.shape - np.array((1, 1))).prod()
     pf = lambda x: 1. - chi2.cdf(x, degrees_of_freedom)
     self.chi2Val = self.inertia * (self.data.sum())
     self.p = pf(self.chi2Val)
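A self-contained sketch of the same test on a hypothetical contingency table, cross-checked against scipy.stats.chi2_contingency: the total inertia times the grand total reproduces the Pearson chi-square statistic.

import numpy as np
from scipy.stats import chi2, chi2_contingency

table = np.array([[20, 30, 25],
                  [35, 15, 40]])   # hypothetical contingency table
n = table.sum()

row = table.sum(axis=1, keepdims=True) / float(n)
col = table.sum(axis=0, keepdims=True) / float(n)
expected = n * row * col
inertia = np.sum((table - expected) ** 2 / expected) / n  # total inertia

chi2_val = inertia * n
dof = (table.shape[0] - 1) * (table.shape[1] - 1)
print(chi2_val, 1 - chi2.cdf(chi2_val, dof))
print(chi2_contingency(table, correction=False)[:2])      # should agree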

# Make 1200 guesses along the sensible domain of N
# TODO use optimisation tools/package or Newton-Raphson instead
Nvector = np.linspace(0.5 * Nmean, Nmean, num=1200)

ksPlot = [ksDist(NIdx, CDF_empir, CDF_binVals) for NIdx in Nvector]
Neff = Nvector[np.where(ksPlot == np.min(ksPlot))[0][0]]
#ksMin = min([ ksDist(NIdx, CDF_empir, CDF_binVals) for NIdx in np.linspace(0, Ntot, num=50) ] )

############################################################
#5) Find further look threshold by evaluating where [ CDF(Chi2(Neff,4))== \alpha ] for confidence level \alpha
############################################################

# Simple theoretical probability the overall max2F came from Gaussian noise
P2Fmax = 1 - chi2.cdf(max2F, 4)


def prob(N, max2F):
    """Works out the probability given a number of templates and a maximum twoF"""
    littleP = 1 - chi2.cdf(max2F, 4)
    return N * littleP * pow(chi2.cdf(max2F, 4), N)


Pval = prob(Neff * Ntot / Nmean, max2F)

############################################################
# Find x, where p(x) is first expected to be > 95%
############################################################
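One way to carry out this step (a sketch, assuming Neff is already available): scan a grid of 2F values and pick the first one at which the probability that all Neff effective templates stay below it exceeds the confidence level.

import numpy as np
from scipy.stats import chi2

alpha = 0.95
Neff_example = 1.0e5                                 # hypothetical effective number of templates
twoF_grid = np.linspace(20, 120, 10001)
p_below = chi2.cdf(twoF_grid, 4) ** Neff_example     # P(all templates below this 2F)
threshold = twoF_grid[np.argmax(p_below > alpha)]    # first grid point exceeding alpha
print(threshold)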

Exemple #59
0
    
output.write('\n')
    
# I think this is not motivated
#logL_lomax = -N/2 * ( np.log(2*np.pi) + 1 - np.log(N) + np.log(ssq_lomax) )

# vs lomax
#LRT = -2*logL_exp1 + 2*logL_lomax
#p  = 1 - chi2.cdf(LRT, np_lomax - np_exp1)

# vs 2-exp
    
if dont_do == False:

    LRT = -2*logL_exp1 + 2*logL_exp2
    p  = 1 - chi2.cdf(LRT, np_exp2 - np_exp1)
    
    # F-ratio
    # np2 must be greater than np1
    # ie model1 is restricted and model2 is unrestricted nested model
    
    F_ratio = ((ssq_exp1 - ssq_exp2) / (np_exp2 - np_exp1)) / ((ssq_exp2 / (N - np_exp2)))
    p_F = 1 - f.cdf(F_ratio, np_exp2 - np_exp1, N - np_exp2)
    
    plt.suptitle('LRT: '+str(p)+'\nF-ratio: '+str(p_F))

    output.write('log-likelihood 1-term exponential: '+str(logL_exp1)+' | Tf='+str(T_1)+'\n')
    output.write('log-likelihood 2-term exponential: '+str(logL_exp2)+' | Tf='+', '.join([str(x) for x in T_2])+'\n')
    output.write('\n')
    output.write('LRT: '+str(LRT)+' | P-value: '+str(p)+'\n')
    output.write('F-ratio: '+str(F_ratio)+' | P-value: '+str(p_F)+' | Tf='+str(T_lomax)+'\n')
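A minimal numeric sketch of the F-ratio comparison used above, with made-up residual sums of squares and parameter counts for a restricted model nested inside an unrestricted one:

from scipy.stats import f

N = 120                 # hypothetical number of data points
ssq_1, np_1 = 45.0, 2   # restricted model: residual sum of squares, number of parameters
ssq_2, np_2 = 31.0, 4   # unrestricted nested model (np_2 > np_1)

F_ratio = ((ssq_1 - ssq_2) / (np_2 - np_1)) / (ssq_2 / (N - np_2))
p_F = 1 - f.cdf(F_ratio, np_2 - np_1, N - np_2)
print(F_ratio, p_F)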
def hubreg(yx,Xx,c=None,sig0=None,b0=None,printitn=0,ITERMAX = 2000,ERRORTOL = 1e-5):

    # ensure that y is a flat 1-D array of length N (not Nx1) and in proper formats
    y = np.copy(np.asarray(yx))
    X = np.copy(np.asarray(Xx))
    y = y if not len(y.shape)==2 else y.flatten()
    
    n,p = X.shape
    
    realdata = np.isrealobj(y)
    
    if c is None:
        c = 1.3415 if realdata else 1.215
        # Default: approx 95% efficiency for Gaussian errors
    
    if b0 is None:
        b0 = np.linalg.lstsq(X[range(len(y)),:],y,rcond=None)[0]
        
    if sig0 is None:
        sig0 = np.linalg.norm(y-X@b0)/np.sqrt(n-p)
        
    csq = c**2
    
    if realdata:
        qn = chi2.cdf(csq,1)
        alpha = chi2.cdf(csq,3)+csq*(1-qn) # consistency factor for scale
    else:
        qn = chi2.cdf(2*csq,2)
        alpha = chi2.cdf(2*csq,4)+csq*(1-qn) # consistency factor for scale
        
    Z = np.linalg.pinv(X)  # pseudoinverse; singular values below rcond are treated as zero
    con = np.sqrt((n-p)*alpha)

    i=1
    
    while i <= ITERMAX:
        # Step 1: update residual
        r = y - X@b0.flatten()  # residual vector
        psires = rsp.psihub(r/sig0,c)*sig0

        # Step 2: Update the scale
        sig1 = np.linalg.norm(psires)/con

        # Step 3: Update the pseudo-residual
        psires = rsp.psihub(r/sig1,c)*sig1

        # Step 4: regress the pseudo-residual on X
        update = Z@psires # update should be vector not matrix
 
        # Step 5: compute the convergence criterion
        crit2 = np.linalg.norm(update)/np.linalg.norm(b0)

        # Step 6: update beta
        b0 += update
        
        if printitn >0 and i%printitn==0:
            print('hubreg: crit(%4d) = %.9f\n' %(i,crit2))
            
        if crit2 < ERRORTOL: break

        sig0 = sig1
        i += 1  # advance the iteration counter
        
    
    if i > ITERMAX: print('hubreg: did not converge, ITERMAX = %d crit2 = %.7f\n' % (ITERMAX, crit2))
    return b0, sig1, i
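The consistency factor above can be checked numerically: for standard normal residuals, E[psi_c(x)^2] equals chi2.cdf(c^2, 3) + c^2*(1 - chi2.cdf(c^2, 1)). A self-contained sketch with a locally defined Huber psi (assumed to behave like the toolbox's rsp.psihub):

import numpy as np
from scipy.stats import chi2

def psihub(x, c):
    """Huber's psi: identity in the core, clipped at +/- c."""
    return np.clip(x, -c, c)

c = 1.3415
alpha = chi2.cdf(c**2, 3) + c**2 * (1 - chi2.cdf(c**2, 1))  # as in hubreg above

x = np.random.default_rng(0).normal(size=1000000)
print(alpha, np.mean(psihub(x, c)**2))   # the two values should be close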