def filter(self, data, *args, **kwargs):
    if (len(data) == 4):
        c1 = data[0]
        n1 = data[1]
        c2 = data[2]
        n2 = data[3]
    else:
        c1 = data[0]
        n1 = self.n1
        c2 = data[1]
        n2 = self.n2
    if c1.ndim > 2:
        p = c1.shape[0]
    else:
        p = 1
    lnq = p * ((n1 + n2) * np.log(n1 + n2) - n1 * np.log(n1) - n2 * np.log(n2)) + \
        n1 * np.log(block_det(c1)) + \
        n2 * np.log(block_det(c2)) - \
        (n1 + n2) * np.log(block_det(c1 + c2))
    rho = 1 - (2 * p ** 2 - 1) * (1 / n1 + 1 / n2 - 1 / (n1 + n2)) / (6 * p)
    o_2 = p ** 2 * (p ** 2 - 1) * \
        (1 / n1 ** 2 + 1 / n2 ** 2 - 1 / (n1 + n2) ** 2) / (24 * rho ** 2) - \
        0.25 * p ** 2 * (1 - 1 / rho) ** 2
    lnq *= -2 * rho
    pfa = (1 - o_2) * chi2.cdf(lnq, p ** 2) + o_2 * chi2.cdf(lnq, p ** 2 + 4)
    return (pfa, median_filter(pfa > self.p_thresh, 3, mode='constant', cval=0))
def serial_test(bits, size=5):
    n = len(bits)
    omeg = [0, 0, 0]
    for i in range(min(3, size)):
        omeg[i] = 2**(size - i) / n * sum([x**2 for x in fqs(bits, size - i)]) - n
    domeg = [0, 0]
    domeg[0] = omeg[0] - omeg[1]
    domeg[1] = omeg[0] - 2*omeg[1] + omeg[2]
    return [1 - chi2.cdf(domeg[0], 2**(size - 1)),
            1 - chi2.cdf(domeg[1], 2**(size - 2))]
def truncated_chi2_mean(c, k):
    """
    chi2 mean up to the cutoff.

    Compute A/B with:
        A = integral[0..c] x chi2(k) dx
        B = integral[0..c] chi2(k) dx
    B is computed via chi2.cdf and A is computed via:
        A = k * integral[0..c] chi2(k+2) dx
    """
    A = k * chi2.cdf(c, k + 2)
    B = chi2.cdf(c, k)
    return A / B
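# A quick Monte Carlo sanity check of truncated_chi2_mean() above (an added
# sketch, not part of the original source): the mean of chi2(k) samples that
# fall below the cutoff c should match k * cdf(c, k+2) / cdf(c, k), which
# follows from the identity x * f_k(x) = k * f_{k+2}(x) for chi2 densities.
import numpy as np
from scipy.stats import chi2

c, k = 5.0, 3
samples = chi2.rvs(k, size=200_000, random_state=0)
empirical = samples[samples < c].mean()
analytic = k * chi2.cdf(c, k + 2) / chi2.cdf(c, k)
print(empirical, analytic)  # the two values should agree to ~2 decimals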
def Main():
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        out = open(args.output, "w")
    print >>out, "# INPUT FILE:", args.input
    print >>out, "# PVALUE CUT:", args.pvalue
    print >>out, "# MAX GAP:", args.gap
    for chrom, pos, x2s, matrix_x2, snps in parseIterRegion(args.input):
        '''
        compute pvalue (from i to j); p[i][j] is the pvalue
        we are here to segment the SNPs into smaller regions
        '''
        print >>sys.stderr, "BiGSEGMent", chrom, len(pos), pos[0], pos[-1]
        if args.fast:
            if len(pos) > 1000:
                SqOR = 0.0
                for k in range(len(pos)):
                    SqOR += x2s[k]
                p = 1.0 - chi2.cdf(SqOR, len(pos))
                print >>out, "BIGREGION\t", chrom, "\t", pos[0], "\t", pos[-1]+1, "\t", p
                for i in range(len(pos)):
                    print >>out, "SNP\t", chrom, pos[i], snps[i], x2s[i], matrix_x2[i]
                print >>out, ""
                continue
        p = numpy.array([[1.0 for row in range(len(pos))] for col in range(len(pos))])
        for i in range(len(pos)):
            SqOR = 0.0
            print >>sys.stderr, i, "\r",
            for j in range(i, len(pos)):
                SqOR += x2s[j]
                p[i][j] = 1.0 - chi2.cdf(SqOR, j+1-i)
                # print "PV", i, j, p[i][j], SqOR, j+1-i
        for (start, end) in Segment(p):
            # print start, end, p[start][end]
            if (p[start][end] < args.pvalue):
                print >>out, "REGION\t", chrom, "\t", pos[start], "\t", pos[end]+1, "\t", p[start][end]
                for i in range(start, end+1):
                    print >>out, "SNP\t", chrom, pos[i], snps[i], x2s[i], matrix_x2[i]
                print >>out, ""
def SPLL(W1, W2, K=3):
    n1 = W1.shape[1]
    n2 = W2.shape[1]
    assert n1 == n2, "The number of features must be the same for W1 and W2"
    n = n1
    s1 = log_LL(W1, W2, K)
    s2 = log_LL(W2, W1, K)
    st = max(s1, s2)
    pst = min(chi2.cdf(st, n), 1 - chi2.cdf(st, n))
    Change = float(pst < 0.05)
    return Change, pst, st
def hypothesisTest(self, seq1, seq2, totalSeq1, totalSeq2):
    # Contingency table:
    #  x1 x2
    #  y1 y2
    x1 = seq1
    x2 = seq2
    y1 = totalSeq1 - x1
    y2 = totalSeq2 - x2
    if (x1 == 0 and x2 == 0) or (x1 == totalSeq1 or x2 == totalSeq2):
        return float('inf'), 1.0, 'degenerate case: suspect p-value'
    N = x1 + x2 + y1 + y2
    E00 = float((x1+x2) * (x1+y1)) / N
    E01 = float((x1+x2) * (x2+y2)) / N
    E10 = float((y1+y2) * (x1+y1)) / N
    E11 = float((y1+y2) * (x2+y2)) / N
    # chi-square statistic with Yates' continuity correction (the -0.5 terms)
    X2 = (abs(x1 - E00) - 0.5)**2 / E00
    X2 += (abs(x2 - E01) - 0.5)**2 / E01
    X2 += (abs(y1 - E10) - 0.5)**2 / E10
    X2 += (abs(y2 - E11) - 0.5)**2 / E11
    # calculate p-value
    pValueTwoSided = 1.0 - chi2.cdf(X2, 1)
    return float('inf'), pValueTwoSided, ''
def combine_p_values(the_p_values, method='z', default_quantile=7.):
    """Combines p-values from repeat measurements into a single p-value.

    the_p_values: a list of p-values.

    method: String. 'z'|'fisher'. 'z' for using the weighted z-score.
    'fisher' for using Fisher's combined probability test.

    default_quantile: Float. Only used for the z method. The quantile to use
    when the normal inverse cdf of the p-value is infinite.
    """
    if len(the_p_values) == 1 or sum(the_p_values) == 0:
        combined_p_value = sum(the_p_values)
    elif method.lower() == 'z':
        # combine p-values using the weighted z-score; replace infinite
        # quantiles with default_quantile so we do not have to deal with them
        the_quantiles = []
        for the_p in the_p_values:
            the_quantile = norm.ppf(1. - the_p)
            if isinf(the_quantile):
                the_quantile = default_quantile
            the_quantiles.append(the_quantile)
        combined_p_value = norm.sf(sum(the_quantiles) / len(the_quantiles)**0.5)
    elif method.lower() == 'fisher':
        combined_p_value = 1 - chi2.cdf(-2*sum(map(log, the_p_values)),
                                        2*len(the_p_values))
    return combined_p_value
def pval(self, data):
    """
    Determines whether this model is better than a simple gaussian model.

    Uses a KS test on the data and determines the p-val of this distribution
    and a gaussian distribution.

    returns: (TF, this_p)
    TF indicates whether this is a reasonable approximation of the data.
    """
    t_data = self.treatdata(data)
    n_mean = N.mean(t_data)
    n_std = N.std(t_data)
    simple_g = m_modal([('norm', (data.mean(), data.std()), 1)])
    simple_g.use_bc = self.use_bc
    # give original data so not transformed twice
    ll_s = simple_g.log_likelihood(data)
    ll_this = self.log_likelihood(data)
    ratio = 2*(ll_this - ll_s)
    # print 'chi2', ratio
    # 6 degrees of freedom from Adam,
    # based on bimodality in blood paper
    # the real test uses 3
    # from likelihood ratio test in wikipedia
    this_p = 1 - chi2.cdf(ratio, 3)
    # print (ll_s, ll_this), ratio, this_p
    TF = this_p > 0.05
    return TF, this_p
def getPValue(self):
    """returns p-value of chi^2 test"""
    p = chi2.cdf(self.getChisquare(), self.getDoF())
    if p < 0.5:
        return p, 'l'
    else:
        return 1 - p, 'r'
def showTwilightPixelDeviationFromMedian(self, row, col):
    x = self.getChisq(row, col)
    reducedChisq = x['reducedChisq']
    chisq = x['chisq']
    percentDiffSpectrum = x['percentDiffSpectrum']
    deltaPercentDiffSpectrum = x['deltaPercentDiffSpectrum']
    nDeltaFromZero = x['nDeltaFromZero']
    degreesOfFreedom = x['degreesOfFreedom']
    print 'reduced chisq =', reducedChisq
    print 'P-value =', 1 - chi2.cdf(chisq, degreesOfFreedom)

    pop = PopUp(parent=self, title='showTwilightPixelDeviationFromMedian')
    pop.axes.errorbar(self.wvlBinEdges[:-1], percentDiffSpectrum,
                      linestyle='-', color='k', yerr=deltaPercentDiffSpectrum)
    pop.axes.set_xlabel(r'$\lambda$ ($\AA$)')
    pop.axes.set_ylabel(r'percent difference')
    pop.axes.plot(self.wvlBinEdges[:-1], len(self.wvlBinEdges[:-1])*[0], 'gray')
    axes2 = pop.axes.twinx()
    axes2.plot(self.wvlBinEdges[:-1], nDeltaFromZero, 'm', alpha=.7)
    align_yaxis(pop.axes, 0, axes2, 0)
    axes2.set_ylabel(r'(pixelSpectrum-avgSpectrum)/$\sigma$', color='m')
    pop.axes.set_title('Deviation from Avg Spectrum (%d,%d)' % (row, col))
    pop.draw()

    weights = self.flatInfo['weights'][row, col]
    pop = PopUp(parent=self, title='showTwilightPixelDeviationFromMedian')
    pop.axes.step(self.wvlBinEdges[:-1], self.averageTwilightSpectrum/self.wvlBinWidths, 'k', label='avg')
    pop.axes.step(self.wvlBinEdges[:-1], self.twilightSpectra[row, col]/self.wvlBinWidths, 'b', label='weighted')
    pop.axes.step(self.wvlBinEdges[:-1], (self.twilightSpectra[row, col]/weights)/self.wvlBinWidths, 'r', label='raw')
    pop.axes.set_xlabel(r'$\lambda$ ($\AA$)')
    pop.axes.set_ylabel(r'counts per $\AA$')
    pop.axes.set_title('Twilight Spectrum (%d,%d)' % (row, col))
    pop.axes.legend(loc='lower right')
    pop.draw()
def chisquare(observed, expected, threshold=0.95, freedom=0):
    '''Performs chi square test with given parameters

    observed is a list of observed values
    expected is a list of expected values
    observed and expected must be same length
    freedom is the degrees of freedom in the variable
    if freedom is not provided, it will be set automatically as len - 1
    threshold is the probability threshold desired

    return value is True or False depending on whether or not the list is
    statistically significant
    True for split is "by chance" and prune the decision
    False for split is not "by chance" and do not prune the decision
    '''
    if not freedom:
        freedom = len(expected) - 1
    chi = 0
    for ob, ex in zip(observed, expected):
        if ex:
            chi += (ob - ex) ** 2 / ex
        else:
            # Check what the mathematical convention for 0 is
            # looks like we should do
            # freedom = freedom - 1
            pass
    pval = 1 - chi2.cdf(chi, freedom)
    return pval < threshold
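# Illustrative usage of chisquare() above (an added sketch, not part of the
# original source): compare observed branch counts against the counts
# expected if the split were pure chance.
observed = [18, 22, 20]
expected = [20.0, 20.0, 20.0]
# chi = 0.4 with 2 degrees of freedom, so pval ~ 0.82 < 0.95 -> True (prune)
print(chisquare(observed, expected, threshold=0.95))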
def __init__(self, x, alpha=0.05, max_points=1000):
    # compute Mardia test coefficient
    n, p = x.shape  # num points, num dimensions
    mu = np.mean(x, axis=0)
    C = np.cov(x.T, bias=1) if p > 1 else np.array([[np.var(x.T, ddof=1)]])
    # squared Mahalanobis distance matrix
    # Note: this forms a full n x n matrix of distances, so will
    # fail for a large number of points.  Kurtosis only requires
    # the diagonal elements so can be computed cheaply.  If there
    # is no order to the points, skew could be estimated using only
    # the block diagonal
    dx = (x - mu[None, :])[:max_points]
    D = np.dot(dx, np.linalg.solve(C, dx.T))
    kurtosis = np.sum(np.diag(D)**2)/n
    skewness = np.sum(D**3)/n**2
    kurtosis_stat = (kurtosis - p*(p+2)) / sqrt(8*p*(p+2)/n)
    raw_skewness_stat = n*skewness/6
    # Small sample correction converges to 1 as n increases, so it is
    # always safe to apply it
    small_sample_correction = (p+1)*(n+1)*(n+3)/((p+1)*(n+1)*n - n*6)
    skewness_stat = raw_skewness_stat * small_sample_correction
    dof = (p*(p+1)*(p+2))/6  # degrees of freedom for chisq test
    self.p_kurtosis = 2*(1 - norm.cdf(abs(kurtosis_stat)))
    self.p_skewness = 1 - chi2.cdf(skewness_stat, dof)
    self.reject_normal = self.p_kurtosis < alpha or self.p_skewness < alpha
    #print("kurtosis", kurtosis, kurtosis_stat, self.p_kurtosis)
    #print("skewness", skewness, skewness_stat, self.p_skewness)
    # compute entropy
    self.entropy = cov_entropy(C)
def combined_p_fisher(p_values):
    """Computes the combined P-value using Fisher's combined probability
    test.
    """
    k = len(p_values)
    W = -2.0 * sum(map(math.log, p_values))
    return 1.0 - chi2.cdf(W, df=2*k)
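# Cross-check of combined_p_fisher() against SciPy's built-in implementation
# (an added sketch, not part of the original source); combine_pvalues with
# method='fisher' computes the same statistic W and chi-square p-value.
import math
from scipy.stats import chi2, combine_pvalues

pvals = [0.04, 0.20, 0.11]
stat, p = combine_pvalues(pvals, method='fisher')
print(p, combined_p_fisher(pvals))  # the two p-values should match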
def _do_lrt(null_lrt, annot_lrt):
    """
    Perform the likelihood ratio test using standard LRT.
    """
    test = -2*(null_lrt - annot_lrt)
    test_result = 1 - chi2.cdf(test, 1)
    return test_result
def construct_tree(examples, attributes, threshold):
    if entropy(examples) == 0:
        return Node(classification=examples[0][label_index], examples=examples)
    elif len(attributes) == 0:
        counter = Counter(example[label_index] for example in examples)
        return Node(classification=max((counter[key], key) for key in counter)[1],
                    examples=examples)
    else:
        best_attribute = get_best_attribute(examples, attributes)
        m, k = degrees_of_freedom(examples, best_attribute)
        s = s_value(m, k, examples, best_attribute)
        df = (m - 1) * (k - 1)
        p_value = 1 - chi2.cdf(s, df)
        if p_value > threshold:
            counter = Counter(example[label_index] for example in examples)
            return Node(classification=max((counter[key], key) for key in counter)[1],
                        examples=examples)
        attributes.remove(best_attribute)
        root = Node(attribute=best_attribute, examples=examples)
        partition_map = partition_on_attribute(examples, best_attribute)
        for attribute_value in partition_map:
            partition = partition_map[attribute_value]
            if len(partition) == 0:
                counter = Counter(example[label_index] for example in examples)
                root.children[attribute_value] = Node(
                    classification=max((counter[key], key) for key in counter)[1],
                    examples=examples)
            else:
                root.children[attribute_value] = construct_tree(partition, attributes, threshold)
        attributes.append(best_attribute)
        return root
def test_count_chunk(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG, idx, log=False):
    pval = sp.zeros((gene_counts.shape[0], 1), dtype='float')
    pval.fill(sp.nan)
    for i in xrange(idx.shape[0]):
        if log:
            log_progress(i, idx.shape[0])
        if sp.isnan(disp_adj[i]):
            continue
        response = gene_counts[i, :].astype('int')
        if sp.sum(response[:response.shape[0] / 2] == 0) >= CFG['max_0_frac'] * response.shape[0] / 2:
            pval[i] = 1
            continue
        modNB0 = sm.GLM(response, dmatrix0,
                        family=sm.families.NegativeBinomial(alpha=disp_adj[i]),
                        offset=sp.log(sf))
        modNB1 = sm.GLM(response, dmatrix1,
                        family=sm.families.NegativeBinomial(alpha=disp_adj[i]),
                        offset=sp.log(sf))
        result0 = modNB0.fit()
        result1 = modNB1.fit()
        pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance,
                               dmatrix1.shape[1] - dmatrix0.shape[1])
    if log:
        log_progress(idx.shape[0], idx.shape[0])
        print ''
    return (pval, idx)
def convert_chis_to_probs(chis, dof):
    chis = chis / np.min(chis) * dof
    prob = 1.0 - chi2.cdf(chis, dof)
    prob = prob / np.max(prob)
    return prob
def assert_chisquared(observed, expected, bin_edges=None, alpha=0.05, title=''):
    """Assert that the "observed" counts are close enough to the "expected"
    counts with a chi2 test at the given confidence level. If the test fails,
    we'll save a histogram to disk.
    """
    chi2_value = np.sum((observed - expected)**2 / expected)
    n_dof = len(observed) - 1  # number of degrees of freedom for the chi2 test
    pval = 1 - chi2distribution.cdf(chi2_value, n_dof)

    print 'observed'
    print observed
    print 'expected'
    print expected
    print 'pval', pval

    if pval < alpha:
        if HAVE_PYPLOT:
            pp.clf()
            pp.title(title)
            pp.bar(bin_edges[0:-1], observed, width=bin_edges[1] - bin_edges[0],
                   alpha=0.5, label='observed', color='red')
            pp.bar(bin_edges[0:-1], expected, width=bin_edges[1] - bin_edges[0],
                   alpha=0.5, label='expected', color='blue')
            pp.legend()
            for i in itertools.count():
                path = 'hist-%d.png' % i
                if not os.path.exists(path):
                    break
            pp.savefig(path)
        raise ValueError('p=%f (<%f), we reject null hypothesis that the distribution '
                         'matches the expected one. saved histogram as %s'
                         % (pval, alpha, path))
def func(data):
    # data is a 2D python array
    dim1 = len(data)
    if(dim1 > 0):
        dim2 = len(data[0])
    if(dim1 <= 1 or dim2 <= 1):
        print "Invalid data"
    else:
        # calculate the rowsums, colsums, tsum
        rowsums = [0]*dim1
        colsums = [0]*dim2
        tsum = 0
        for i in range(0, dim1):
            for j in range(0, dim2):
                rowsums[i] = rowsums[i] + data[i][j]
                colsums[j] = colsums[j] + data[i][j]
        tsum = sum(rowsums)
        print "r : " + str(rowsums)
        print "c : " + str(colsums)
        print "t : " + str(tsum)
        chiS = 0
        exp = list()
        for i in range(0, dim1):
            d = list()
            for j in range(0, dim2):
                e = float(rowsums[i]*colsums[j])/tsum
                val = ((data[i][j] - e)**2)/e
                chiS = chiS + val
                d.append(val)
            exp.append(d)
        print chiS
        degree = (dim1 - 1)*(dim2 - 1)
        print degree
        print chi2.cdf(chiS, degree)
def computeProbabilityOfObservedOffset(x, y, p=None):
    """Compute probability that p is consistent with mean of distribution.

    For a 2 dimensional distribution of points, given by ``x`` and ``y``,
    compute the probability that the mean of the distribution is consistent
    with the input point ``p``

    Inputs:
    -------------
    x, y
        (float) Input values. Require that ``len(x) == len(y)``

    Optional Inputs:
    -----------------
    p
        (array of length 2) Point to test. Default is [0,0]

    Returns:
    -----------
    probOffset
        (float) Probability that input point is consistent with
        mean of distribution.

    chiSquare
        (float) The chi squared of the point. For highly inconsistent
        points the computed probability of offset flatlines at zero.
        The chisquare value can then be used to estimate the relative
        consistencies of different points.

    Notes:
    ---------
    See ``plotErrorEllipse`` for a description of the algorithm
    """
    if p is None:
        p = [0, 0]
    p = np.array(p)
    assert(len(p) == 2)
    assert(len(x) == len(y))

    if len(x) < 2:
        raise ValueError("Need at least two points to compute probability of offset")

    mu = np.array([np.mean(x), np.mean(y)])
    cov = np.cov(x, y) / np.sqrt(len(x))

    eigenVals, eigenVecs = np.linalg.eigh(cov)
    v1 = eigenVecs[:, 0]
    v2 = eigenVecs[:, 1]

    pDash = (p - mu)
    offset_pix = np.array([np.dot(pDash, v1), np.dot(pDash, v2)])
    sigma = np.sqrt(eigenVals)
    offset_sigma = offset_pix / sigma
    s = np.sum(offset_sigma**2)
    probOffset = chi2.cdf(s, 2)
    return probOffset, s
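# Illustrative call of computeProbabilityOfObservedOffset() (an added sketch,
# not part of the original source): test whether the origin is consistent
# with the mean of a cloud of simulated centroid measurements.
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(0.1, 1.0, size=100)
y = rng.normal(0.0, 1.0, size=100)
probOffset, chiSquare = computeProbabilityOfObservedOffset(x, y, p=[0, 0])
print(probOffset, chiSquare)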
def corr_circular_linear(alpha, X):
    # Authors: Jean-Remi King <*****@*****.**>
    #
    # Licence : BSD-simplified
    """
    Parameters
    ----------
    alpha : numpy.array, shape (n_angles,)
        The angular data (if n_dims == 1, repeated across all x dimensions)
    X : numpy.array, shape (n_angles, n_dims)
        The linear data

    Returns
    -------
    R : numpy.array, shape (n_dims)
        R values
    R2 : numpy.array, shape (n_dims)
        R square values
    p_val : numpy.array, shape (n_dims)
        P values

    Adapted from:
        Circular Statistics Toolbox for Matlab
        By Philipp Berens, 2009
        [email protected] - www.kyb.mpg.de/~berens/circStat.html
        Equation 27.47
    """
    from scipy.stats import chi2
    from jr.utils import pairwise
    import numpy as np

    # computes correlation for sin and cos separately
    # WIP Applies repeated correlation if X is vector
    # TODO: deals with non repeated correlations (X * ALPHA)
    if alpha.ndim > 1:
        rxs = repeated_corr(np.sin(alpha), X)
        rxc = repeated_corr(np.cos(alpha), X)
        rcs = np.zeros_like(alpha[0, :])
        rcs = pairwise(np.sin(alpha), np.cos(alpha), func=_loop_corr, n_jobs=-1)
    else:
        # WIP Applies repeated correlation if alpha is vector
        rxs = repeated_corr(X, np.sin(alpha))
        rxc = repeated_corr(X, np.cos(alpha))
        rcs = repeated_corr(np.sin(alpha), np.cos(alpha))

    # Adapted from equation 27.47
    R = (rxc ** 2 + rxs ** 2 - 2 * rxc * rxs * rcs) / (1 - rcs ** 2)

    # JR adhoc way of having a sign....
    R = np.sign(rxs) * np.sign(rxc) * R
    R2 = np.sqrt(R ** 2)

    # Get degrees of freedom
    n = len(X)
    pval = 1 - chi2.cdf(n * R2, 2)

    return R, R2, pval
def update_w(claim, index, count, truth, m, n, eps=1e-15):
    rtn = -np.ones(m)
    for i in range(n):
        rtn[index[i]] = rtn[index[i]] + (claim[i] - truth[i])**2
    rtn[rtn == 0] = 1e10
    rtn[rtn > 0] = chi2.cdf(0.025, count[rtn > 0]) / rtn[rtn > 0]
    # rtn[rtn>0] = chi2.interval(0.05, count[rtn>0])[0]/rtn[rtn>0]
    return(rtn)
def solve_chi_squared(chi_squared_value=None, f=None, p=None):
    max_1_none(chi_squared_value, f, p)
    if chi_squared_value is None:
        return chi_squared(f, p)
    elif f is None:
        raise NotImplementedError("Not implemented yet - sorry")
    elif p is None:
        return sympify(sci_chi2.cdf(float(chi_squared_value), float(f)))
def compute_ANOVA(self, mu=None, mu_start=0, return_weights=0):
    """
    Returns -2 log likelihood, the pvalue and the maximum likelihood
    estimate for a common mean.

    Parameters
    ----------
    mu : float
        If a mu is specified, ANOVA is conducted with mu as the common mean.
        Otherwise, the common mean is the maximum empirical likelihood
        estimate of the common mean.  Default is None.
    mu_start : float
        Starting value for the common mean if a specific mu is not specified.
        Default = 0.
    return_weights : bool
        If True, returns the weights on observations that maximize the
        likelihood.  Default is False.

    Returns
    -------
    res : tuple
        The log-likelihood, p-value and estimate for the common mean.
    """
    if mu is not None:
        llr = self._opt_common_mu(mu)
        pval = 1 - chi2.cdf(llr, self.num_groups - 1)
        if return_weights:
            return llr, pval, mu, self.new_weights
        else:
            return llr, pval, mu
    else:
        res = optimize.fmin_powell(self._opt_common_mu, mu_start, full_output=1)
        llr = res[1]
        mu_common = float(res[0])
        pval = 1 - chi2.cdf(llr, self.num_groups - 1)
        if return_weights:
            return llr, pval, mu_common, self.new_weights
        else:
            return llr, pval, mu_common
def TS2sigma(TS, dof, quiet=False):
    """ one-sided Chi^2 test """
    pval_1 = chi2.cdf(TS, dof)
    sigma = math.sqrt(2) * sp.erfinv(pval_1)
    if not quiet:
        print "TS=%.2f\t->\t%.2f sigma" % (TS, sigma)
    return sigma
def chi_square_test(sequence, k):
    frequencies = {i: 0 for i in range(k)}
    for element in sequence:
        frequencies[floor(element * k)] += 1
    expected_frequency = float(len(sequence)) / k
    chi_square = sum(
        (frequency - expected_frequency) ** 2 / expected_frequency
        for frequency in list(frequencies.values()))
    return chi_square, chi2.cdf(chi_square, k)
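# Illustrative run of chi_square_test() above (an added sketch, not in the
# original source): bin pseudo-random draws on [0, 1) into k equal cells and
# test whether the cell counts look uniform.
import random
from math import floor
from scipy.stats import chi2

random.seed(0)
sequence = [random.random() for _ in range(10_000)]
stat, cdf_value = chi_square_test(sequence, k=10)
print(stat, 1 - cdf_value)  # large right-tail probability -> looks uniform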
def block_runs(bitblocks):
    bitblocks = prepare_bitblocks(bitblocks)
    try:
        B, _, K = BNKlookup(bitlength(bitblocks))
    except IndexError:
        print("Tried (and failed) to count runs in a set that is less than 128 elements.")
    else:
        chi = chi_squared(bitblocks)
        return 1 - chi2.cdf(chi, K)
def reduced_chi_square(xvals, yvals, sigy, func, numparam):
    """
    Returns the reduced chi-squared, pvalue, and DOF of the fit.
    """
    c = 0
    n = len(xvals) - numparam
    for x, y, s in zip(xvals, yvals, sigy):
        c += (y - func(x))**2 / (s**2)
    return c/n, float(1 - chi2.cdf(c/n, n)), n
def NOTM(bits, blocksize=110000, tmpltsz=9):
    template = maketemplate(tmpltsz)
    blocks = partition_bits(bits, blocksize)
    binpow = 1/2**tmpltsz
    theor_mean = binpow*(blocksize - tmpltsz + 1)
    theor_var = blocksize*(binpow - binpow**2*(2*tmpltsz - 1))
    chi = 0
    for block in blocks:
        chi += (find_matches(block, template) - theor_mean)**2 / theor_var
    return 1 - chi2.cdf(chi, 2)
def combine_exact(self):
    # Hypothesis Test:
    # H0(i->j): i can be an ancestor of j
    # HA(i-*>j): i cannot be an ancestor of j
    statistic = np.zeros(self.dim)
    for index in range(self.size):
        # H0 specifies that all sampled diff values have + means -->
        # all the observed positive values are not any different than the H+
        # null hypothesis --> they contribute a multiplicative value of 1 to
        # the LR statistic
        # the observed negative values are where the ratio difference lies,
        # and those will be forced to be generated by mean of 0 under H+
        mask = (self.data[index] < 0).astype(int)
        # sum up the (-2ln(x)) for x corresponding to each of
        # the terms in the likelihood ratio
        # put zero where the delta is missing and
        # is filled with a place holder
        statistic = statistic + \
            ((self.data[index] / self.sigma[index])**2) * \
            mask * self.isfilled[index]
    # sum over isfilled attribute to get the total number
    # of samples available for comparison of a pair of mutations
    totalSamples = sum(self.isfilled)
    # get the list of existing numbers of available samples across
    # all pairs of mutations
    countRange = map(int, list(np.unique(totalSamples)))
    # start with a blank input for pvalue
    self.pvalue = np.zeros(self.dim)
    for value in countRange:
        # for each value in the existing set of available samples for a pair,
        # assign the p-value for such pairs using the exact test formulation
        # for that total sample count
        mask = (totalSamples == value).astype(int)
        if value == 0:
            # if no samples are available for a pair of mutations,
            # use a pvalue of 1 to indicate lack of rejection
            # (insufficient information)
            self.pvalue += mask
            continue
        n = value
        pvalue = np.zeros(self.dim)
        for k in range(1, 1+n):
            pvalue += (1 - chi2.cdf(statistic, k)) * \
                choose(n, k, exact=True) / (2.0 ** n)
        self.pvalue += pvalue * mask
def get_p_value(T):
    # same as scipy.stats.chi2_contingency(T, correction=False)
    det = T[0,0]*T[1,1] - T[0,1]*T[1,0]
    c2 = float(det) / T[0].sum() * det / T[1].sum() * T.sum() / T[:,0].sum() / T[:,1].sum()
    p = 1 - chi2.cdf(x=c2, df=1)
    return p
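# Verification of get_p_value() against scipy.stats.chi2_contingency, which
# the comment above references (an added sketch, not part of the original
# source).
import numpy as np
from scipy.stats import chi2, chi2_contingency

T = np.array([[36, 14], [30, 25]])
chi2_stat, p, dof, expected = chi2_contingency(T, correction=False)
print(p, get_p_value(T))  # the two p-values should agree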
x, y, err = np.loadtxt(fname, unpack=True)  # Read the data in
n = len(x)                                  # No. of data points

# Initial values of parameters
p0 = [1.0, 0.01]
# Read about lambda notation here:
# http://www.secnetix.de/olli/Python/lambda_functions.hawk
f = lambda x, a, b: (a / (x * x)) + b

p, covm = curve_fit(f, x, y, p0, err)       # Do the fit
a, b = p
chisq = sum(((f(x, a, b) - y) / err)**2)    # Compute chi-squared
ndf = n - len(p)                            # No. of degrees of freedom
print("NDF is ", ndf)
Q = 1. - chi2.cdf(chisq, ndf)               # Quality of fit parameter: Q. More the better!
chisq = chisq / ndf                         # Compute chi-squared per DOF
covm = covm / chisq                         # !
aerr, berr = np.sqrt(np.diag(covm))         # Set the error bars

# ======================================
# Print the results
# p[0] = a ; p[1] = b
print("-----")
print("a = %10.8f +/- %7.8f with %7.4f percent error " % (a, aerr, (aerr / a) * 100))
print("b = %10.7f %7.7f with %7.4f percent error " % (b, berr, (berr / b) * 100))
print("chi squared / NDF = %7.4lf" % chisq)
print("CL = %2.2f" % Q)  # Checked that it is 1 - pvalue (gnuplot); 1 - P
from scipy.stats import chi2

chi_square = sum([(o - e)**2. / e for o, e in zip(Observed_Values, Expected_Values)])
chi_square_statistic = chi_square[0] + chi_square[1]
print("chi-square statistic:-", chi_square_statistic)

# In[38]:

# critical value
critical_value = chi2.ppf(q=1 - alpha, df=df)
print('critical_value:', critical_value)

# In[39]:

# p-value
p_value = 1 - chi2.cdf(x=chi_square_statistic, df=df)
print('p-value:', p_value)

# In[40]:

print('Significance level: ', alpha)
print('Degree of Freedom: ', df)
print('chi-square statistic:', chi_square_statistic)
print('critical_value:', critical_value)
print('p-value:', p_value)

# In[27]:

# compare chi_square_statistic with critical_value and the p-value, which is
# the probability of getting chi-square > chi_square_statistic
if chi_square_statistic >= critical_value:
    print("Reject H0,There is a relationship between 2 categorical variables")
#!/usr/bin/env python
from __future__ import print_function
import pylab as pl
import scipy.optimize
from scipy.stats import chi2

for fa_rate in 1.0/pl.array([1e1, 1e2, 1e4, 1e6, 1e9]):
    print(fa_rate)
    for df in range(1, 7):
        f_eq = lambda x: ((1 - fa_rate) - chi2.cdf(x, df))**2
        res = scipy.optimize.minimize(f_eq, df)
        assert res['success']
        print('\t', res.x[0])
TEST_SEASONS = {
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
}
ALPHA = 0.05

players = [player for player in Player if get_season(player) in TEST_SEASONS]
train_input = [float(get_age(player)) for player in players]
train_output = [1.0 if get_is_mol(player) else 0.0 for player in players]
non_mol = np.array(
    [data for data, label in zip(train_input, train_output) if label == 0.0])
mol = np.array(
    [data for data, label in zip(train_input, train_output) if label == 1.0])
non_mol_kde = InnerAppearanceLayer.kernel_density_estimation(non_mol)
non_mol_bandwidth = silverman_bandwidth(np.array(non_mol))
non_mol_points = len(non_mol)
mol_kde = InnerAppearanceLayer.kernel_density_estimation(mol)
mol_bandwidth = silverman_bandwidth(np.array(mol))
mol_points = len(mol)
ages = [float(age) for age in range(20, 59)]
statistic_values = [
    test_statistic(age, non_mol_kde, non_mol_bandwidth, non_mol_points,
                   mol_kde, mol_bandwidth, mol_points) for age in ages
]
p_values = [chi2.cdf(x, 1) for x in statistic_values]
p_order = sorted(p_values)
hochman = [ALPHA / (len(p_order) - j + 1) for j in range(1, len(p_order) + 1)]
indices = [
    i for i, pair in enumerate(zip(p_order, hochman)) if pair[0] <= pair[1]
]
print(indices)
def chisq_test(O, E, degree=3, sig_level=0.05):
    measured_val = sum([(o - e)**2/e for (o, e) in zip(O, E)])
    return chi2.cdf(measured_val, degree), chi2.sf(measured_val, degree)
def _get_p_value(self) -> None:
    self.p_value = 1 - chi2.cdf(x=self.chi_square_statistic,
                                df=self.degree_of_freedom)
def get_pvalue(gene, data, ratio=0.5, isplot=True):
    lifespan_table = pd.DataFrame(
        index=['visit7', 'visit8', 'visit9', 'visit10'],
        columns=['obs_num1', 'prob1', 'expt_num1',
                 'obs_num2', 'expt_num2', 'prob2', 'obs_num'])
    gene_median = np.median(data.loc[:, gene])
    group_high = data.loc[data.loc[:, gene] >= gene_median]
    group_low = data.loc[data.loc[:, gene] < gene_median]
    # cutoff = np.quantile(data.iloc[:, 0:4], quantile)
    cutoff = np.min(np.min(data.iloc[:, 0:4])) * ratio
    n1 = 9   # sample num in each group
    n2 = 10  # sample num in each group
    n = n1 + n2
    # identical bookkeeping for each of the four visits
    visits = [('visit7', 'visit07_L*'), ('visit8', 'visit08_L*'),
              ('visit9', 'visit09_L*'), ('visit10', 'visit10_L*')]
    for row, col in visits:
        obs_num1 = np.size(np.where(group_high.loc[:, col] < cutoff))
        obs_num2 = np.size(np.where(group_low.loc[:, col] < cutoff))
        prob1 = obs_num1 / n1
        prob2 = obs_num2 / n2
        obs_num = obs_num1 + obs_num2
        expt_num1 = obs_num*(n1/n)
        expt_num2 = obs_num*(n2/n)
        lifespan_table.loc[row, :] = [obs_num1, prob1, expt_num1,
                                      obs_num2, expt_num2, prob2, obs_num]
    O1 = lifespan_table.loc[:, 'obs_num1'].sum()
    O2 = lifespan_table.loc[:, 'obs_num2'].sum()
    E1 = lifespan_table.loc[:, 'expt_num1'].sum()
    E2 = lifespan_table.loc[:, 'expt_num2'].sum()
    X = np.power(O1-E1, 2)/E1 + np.power(O2-E2, 2)/E2
    p_value = 1 - chi2.cdf(X, df=1)
    print(gene, p_value)
    if p_value < 0.05 and isplot:
        prob1 = lifespan_table.loc[:, 'prob1']
        prob2 = lifespan_table.loc[:, 'prob2']
        draw_surv_plot(prob1, prob2, gene, p_value, ratio)
        draw_box_plot(data, gene, ratio)
    return p_value
def test_count(data, opts):
    """ Make a test for all genes iteratively.

    @args data: Store all input data and results
    @type data: Class object
    @args opts: Input argument to the main TE function
    @type opts: Instance
    """
    print 'Start the statistical test.'

    num = len(data.geneIDs)
    pval = np.empty((num, 1))
    pval.fill(np.nan)

    explanatory0 = cm.create_matrix(data, model='H0')
    explanatory1 = cm.create_matrix(data, model='H1')
    librarySizes = np.hstack([data.libSizesRibo, data.libSizesRna])
    lenSampleRibo = data.idxRibo.size
    lenSampleRna = data.idxRna.size

    errorCnt = 0
    for i in range(num):
        sys.stdout.flush()
        if i % 50 == 0:
            print '\r%i genes finished...' % i,
        if i + 1 == num:
            print '\r%i genes finished.' % num

        if opts.dispDiff and np.isnan(data.dispAdjRibo[i]):
            continue
        if not opts.dispDiff and np.isnan(data.dispAdj[i]):
            continue

        response = np.hstack([data.countRibo[i, :], data.countRna[i, :]])

        if opts.dispDiff:
            disp = np.hstack([
                np.repeat(data.dispAdjRibo[i], lenSampleRibo),
                np.repeat(data.dispAdjRna[i], lenSampleRna)
            ])
        else:
            disp = data.dispAdj[i]

        try:
            modNB0 = sm.GLM(response, explanatory0,
                            family=sm.families.NegativeBinomial(alpha=disp),
                            offset=np.log(librarySizes))
            modNB1 = sm.GLM(response, explanatory1,
                            family=sm.families.NegativeBinomial(alpha=disp),
                            offset=np.log(librarySizes))
            result0 = modNB0.fit()
            result1 = modNB1.fit()
        except sm.tools.sm_exceptions.PerfectSeparationError:
            errorCnt += 1
        else:
            if not opts.dispDiff:
                pval[i] = 1 - chi2.cdf(
                    result0.deviance - result1.deviance,
                    explanatory1.shape[1] - explanatory0.shape[1])
            elif opts.dispDiff:
                pval[i] = 1 - chi2.cdf(
                    result0.deviance - result1.deviance,
                    (explanatory1.shape[1] - explanatory0.shape[1]) / 2.5)
            else:
                pass

    data.pval = pval
    sys.stdout.write(
        'Warning: Failed to do test: %i genes. P value set to \'nan\'.\n'
        % errorCnt)
    return data
def compare(self, contrast, alpha=0.05):
    """Compare predictions given a contrast

    If there are two predictions, you can compare as:
        roc.compare(contrast=[1, -1], alpha=0.05)
    """
    # Validate alpha
    if (alpha <= 0) | (alpha >= 1):
        raise ValueError('alpha must be in the range (0, 1), exclusive.')
    elif alpha > 0.5:
        alpha = 1 - alpha

    # Verify if covariance was calculated
    if self.S is None:
        self._calculate_covariance()

    # L as matrix
    L = np.array(contrast, dtype=float)
    if len(L.shape) == 1:
        L = L.reshape(1, L.shape[0])

    # Shapes
    L_sz = L.shape
    S_sz = self.S.shape

    # Contrast column count must equal the number of classifiers
    if (S_sz[1] != L_sz[1]):
        raise ValueError(
            'Contrast should have %d elements (number of predictors)',
            S_sz[1])

    # Validate contrast
    if np.any(np.sum(L, axis=1) != 0):
        raise ValueError('Contrast rows must sum to 0', S_sz[1])

    # Calculate LSL matrix
    LSL = L @ self.S @ np.transpose(L)

    # Normal vs chi^2 distribution
    if L_sz[0] == 1:
        # Compute using the normal distribution
        mu = L @ np.transpose(self.auc)
        sigma = np.sqrt(LSL)
        thetaP = norm.cdf(0, mu, sigma)
        # 2-sided test, double the tails -> double the p-value
        if mu < 0:
            thetaP = 2 * (1 - thetaP)
        else:
            thetaP = 2 * thetaP
        # Confidence intervals
        theta2 = norm.ppf([alpha / 2, 1 - alpha / 2], mu, sigma)
    else:
        # Calculate chi2 stat with DOF = rank(L*S*L')
        # first invert the LSL matrix
        inv_LSL = np.linalg.inv(LSL)
        # then calculate the chi2
        w_chi2 = self.auc @ np.transpose(L) @ inv_LSL @ L @ np.transpose(self.auc)
        w_df = np.linalg.matrix_rank(np.transpose(LSL))
        thetaP = 1 - chi2.cdf(w_chi2, w_df)
        theta2 = chi2.ppf([alpha / 2, 1 - alpha / 2], w_df)
    return np.ndarray.item(thetaP), theta2
5. P-VALUE: PASS THE TEST OR NOT
The probability of finding results this extreme (right tail, positive values).
A conditional probability: finding values this extreme given that the
distribution is normal.
WRONG: the probability that it is normal given the value obtained.
'''
x_mean = np.mean(x)
x_std = np.std(x)
x_skew = skew(x)
x_kurtosis = kurtosis(x)  # excess kurtosis, k-3
# Jarque-Bera statistic: how far you are from normality (should be small)
x_jb_stat = nb_sim / 6 * (x_skew**2 + 1 / 4 * x_kurtosis**2)
# the statistic follows a chi2 distribution with 2 degrees of freedom
p_value = 1 - chi2.cdf(x_jb_stat, df=2)
# If p-value < significance level => reject H0.
# If p-value > significance level => do not reject H0.
x_is_normal = (p_value > 0.05)  # equivalent to jb < 6

print('skewness is ' + str(x_skew))
print('kurtosis is ' + str(x_kurtosis))
print('Jarque-Bera statistic is ' + str(x_jb_stat))
print('p-value is ' + str(p_value))
print('is normal ' + str(x_is_normal))

# jb_list = []
# jb_list.append(x_jb_stat)

# Plot histogram
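# Cross-check of the hand-rolled Jarque-Bera statistic above against
# scipy.stats.jarque_bera (an added sketch, not in the original source).
import numpy as np
from scipy.stats import jarque_bera

rng = np.random.default_rng(0)
x = rng.standard_normal(10_000)
stat, pval = jarque_bera(x)
print(stat, pval)  # for normal data the p-value should be well above 0.05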
def result_cross(F, Y, T, N, Q):
    Lambda, SD, CV, p_value = test_statistic_cross(F, Y, T, N, Q)
    Lambda_1 = (N - Q) * Lambda
    p_value_1 = 1 - chi2.cdf(Lambda_1, N - Q)
    return (Lambda, p_value, Lambda_1, p_value_1)
#%%
import numpy as np
from scipy.stats import chi2
from scipy.stats import chisquare
import matplotlib.pyplot as plt

df = 3
rv = chi2(df)
vals = chi2.ppf([0.001, 0.5, 0.999], df)
np.allclose([0.001, 0.5, 0.999], chi2.cdf(vals, df))

stat = [101, 189, 317, 393]
e = [100, 200, 300, 400]
k2 = 0
for i in range(len(e)):
    k2 += (stat[i] - e[i])**2 / e[i]
print(k2)
print(chisquare(stat, f_exp=e))
p = 1 - chi2.cdf(k2, df)
print(p)

x = np.linspace(0, 10, 1000)
alpha = 0.05
k2 = chisquare(stat, f_exp=e)[:1][0]
y = chi2.sf(x, df)
plt.plot(x, y)
plt.hlines(alpha, 0, 10, colors='r')
from scipy import stats
from scipy.stats import t as t_dist
from scipy.stats import chi2
from abtesting_test import *
import math

# You can comment out these lines! They are just here to help follow along to the tutorial.
print(t_dist.cdf(-2, 20))      # should print .02963
print(t_dist.cdf(2, 20))       # positive t-score (bad), should print .97036 (= 1 - .02963)
print(chi2.cdf(23.6, 12))      # prints 0.976
print(1 - chi2.cdf(23.6, 12))  # prints 1 - 0.976 = 0.023 (yay!)

# TODO: Fill in the following functions! Be sure to delete "pass" when you want to use/run a function!
# NOTE: You should not be using any outside libraries or functions other than the simple operators (+, **, etc)
# and the specifically mentioned functions (i.e. round, cdf functions...)

def slice_2D(list_2D, start_row, end_row, start_col, end_col):
    '''
    Slices the 2D list via start_row:end_row and start_col:end_col
    :param list: list of list of numbers
    :param nums: start_row, end_row, start_col, end_col
    :return: the sliced 2D list (ending indices are exclusive)
    '''
    to_append = []
    for l in range(start_row, end_row):
        to_append.append(list_2D[l][start_col:end_col])
def get_filtered_VariantsLogRatio(self):
    iter = 0
    # get most abundant base
    self.maxA = np.argmax(self.freq, axis=1)
    # get second most abundant base
    ftemp = np.copy(self.freq)
    for v in range(self.V):
        ftemp[v, self.maxA[v]] = -1
    self.maxB = np.argmax(ftemp, axis=1)

    N = (self.freq.sum(axis=1)).astype(np.float64)
    n = (self.freq.max(axis=1)).astype(np.float64)  # value of most abundant base
    m = (ftemp.max(axis=1)).astype(np.float64)      # value of second most abundant
    e = N - n

    self.filtered = N < self.Nthreshold
    self.minV = np.zeros(self.V)
    self.minV[self.filtered == False] = m[self.filtered == False] / N[self.filtered == False]

    p = np.zeros(self.V)
    MLL = np.zeros(self.V)
    lastSelect = 0
    Select = self.V
    p[self.filtered == False] = n[self.filtered == False] / N[self.filtered == False]
    p[p > self.upperP] = self.upperP

    while iter < self.max_iter and lastSelect != Select:
        # filter based on current error rate
        BLL = -(log(self.eta[self.maxA, :]) * self.freq).sum(axis=1)
        for v in range(self.V):
            if self.optimise:
                res = minimize_scalar(mixNLL, bounds=(0.0, self.upperP),
                                      args=(self.eta, self.maxA[v], self.maxB[v],
                                            self.ffreq[v, :]),
                                      method='bounded')
                p[v] = res.x
            MLL[v] = mixNLL(p[v], self.eta, self.maxA[v], self.maxB[v],
                            self.ffreq[v, :])
        ratioNLL = 2.0 * (BLL - MLL)
        self.filtered = np.logical_or(N < self.Nthreshold, ratioNLL < self.threshold)
        ff = self.freq[self.filtered]
        af = self.maxA[self.filtered]
        Nf = N[self.filtered]
        self.eta = 96 * np.identity((4)) + np.ones((4, 4))
        for v in range(Nf.shape[0]):
            self.eta[af[v], :] += ff[v, :]
        esums = self.eta.sum(axis=1)
        self.eta = self.eta / esums[:, np.newaxis]
        lastSelect = Select
        Select = self.V - self.filtered.sum()
        logging.info("Variant filter iter: " + str(iter) + " " + str(Select) + " " + str(self.eta))
        sys.stdout.flush()
        iter = iter + 1

    self.pvalue = 1.0 - chi2.cdf(ratioNLL, 1)
    self.qvalue = benjamini_Hochberg(self.pvalue)
    self.ratioNLL = ratioNLL
    self.filtered = np.logical_or(N < self.Nthreshold, self.qvalue > self.qvalue_cutoff)
    self.snps_filter = self.snps_filter[self.filtered != True, :, :]
    self.selected_indices = list(np.where(self.filtered != True))
    self.selected_indices = self.selected_indices[0].tolist()
    self.selected = self.filtered != True
    self.NS = self.snps_filter.shape[0]
    return self.snps_filter
def chi_sq(t, x, err, f, n, df):
    chi_sq = sum((x - f(t, n))**2/err**2)
    p_value = 1 - chi2.cdf(chi_sq, df)
    return chi_sq, p_value
                                         mRNA, samples=25, plot=False)

# calculate v
v0 = sum(res0**2)
vA = sum(resA**2)
vC = sum(resC**2)
vD = sum(resD**2)

# calculate dw
dw0 = durbin_watson(res0)
dwA = durbin_watson(resA)
dwC = durbin_watson(resC)
dwD = durbin_watson(resD)

# p-values based on explicit testing
chi2p0 = 1 - chi2.cdf(v0, n - 1)
chi2pA = 1 - chi2.cdf(vA, n - 2)
chi2pC = 1 - chi2.cdf(vC, n - 3)
chi2pD = 1 - chi2.cdf(vD, n - 4)

# explicit testing
# rejection because of chi2 test
r0_ex = chi2p0 < 0.05
rA_ex = chi2pA < 0.05
rC_ex = chi2pC < 0.05
rD_ex = chi2pD < 0.05

# get lower limit for dw-test (explicit)
dL0 = float(dw[(dw.n == n) & (dw.m == 2)].dL)
dLA = float(dw[(dw.n == n) & (dw.m == 2)].dL)
dLC = float(dw[(dw.n == n) & (dw.m == 3)].dL)
if n == 7:
    dLD = float(dw[(dw.n == n) & (dw.m == 3)].dL)
y_data = log_MF[ok]
y_data_err = (data["std90_pc_cen"][ok]**2. + data["dN_counts_cen"][ok]**(-1.))**(0.5)
#p.hist(n.log10(y_data_err), bins=10)
#p.show()

ps = n.array([0.333, 0.794, 0.247])
log_fsigma = lambda logsigma, A, a, p: n.log10(lib.f_BH(10**-logsigma, A, a, p, 1.))
print "ST01 fit ----------------------"
pOpt, pCov = curve_fit(log_fsigma, x_data, y_data, ps, sigma=y_data_err,
                       maxfev=50000000)  # , bounds=boundaries)
chi2 = n.sum(((log_fsigma(x_data, pOpt[0], pOpt[1], pOpt[2]) - y_data) / y_data_err)**2.)
ndof = (len(x_data) - len(ps))
print "best params=", pOpt
print "err=", pCov.diagonal()**0.5
print "chi2 ", chi2, ndof, chi2/ndof
print "P chi2 1-cdf", 1 - stc2.cdf(int(chi2), ndof)
print "---------------------------------------------------"
pOpt_ST01 = pOpt
pErr_ST01 = pCov.diagonal()**0.5
n.savetxt(join(os.environ['MVIR_DIR'], "mvirFunction_parameters_ST01_MFonly_fit.txt"),
          n.transpose([pOpt_ST01, pErr_ST01]), header="A a p")

print "BATT 2011"
ps = n.array([A0, a0, p0, q0])
log_fsigma = lambda logsigma, A, a, p, q: n.log10(lib.f_BH(10**-logsigma, A, a, p, q))
pOpt, pCov = curve_fit(log_fsigma, x_data, y_data, ps, sigma=y_data_err,
                       maxfev=50000000)  # , bounds=boundaries)
chi2 = n.sum(((log_fsigma(x_data, pOpt[0], pOpt[1], pOpt[2], pOpt[3]) - y_data) / y_data_err)**2.)
ndof = (len(x_data) - len(ps))
print "best params=", pOpt
print "err=", pCov.diagonal()**0.5
def gal_smass(id, i, ierr, gr, ri, iz, gre, rie, ize, zed, splines, zmet, galaxy):
    print galaxy, " at z = ", zed
    cd = CosmologicalDistance.CosmologicalDistance()
    ldistDict = dict()
    rest_gr, rest_gi, weight = [], [], []
    masslight, sfrs, ages, zmets, kii, kri = [], [], [], [], [], []
    minChiSq = 999
    spIndex = -1
    for sp in range(0, len(splines)):
        # for speed
        skey = str(sp) + "-" + str(zed)
        sgr = splines[sp][0](zed)
        sri = splines[sp][1](zed)
        siz = splines[sp][2](zed)
        sgrr = splines[sp][4](zed)   # restframe g-r
        sgir = splines[sp][5](zed)   # restframe g-i
        skii = splines[sp][6](zed)   # kcorrection: i_o - i_obs
        skri = splines[sp][7](zed)   # kcorrection: r_o - i_obs
        sml = splines[sp][8](zed)    # log(mass/light) (M_sun/L_sun)
        ssfr = splines[sp][9](zed)
        sage_cosmic = splines[sp][10](zed)
        sage = splines[sp][11](zed)
        szmet = zmet[sp]             # To be changed if SFH changes
        gr_chisq = pow((gr - sgr) / gre, 2)
        ri_chisq = pow((ri - sri) / rie, 2)
        iz_chisq = pow((iz - siz) / ize, 2)
        rest_gr.append(sgrr)
        rest_gi.append(sgir)
        kii.append(skii)
        kri.append(skri)
        masslight.append(sml)
        sfrs.append(ssfr)
        ages.append(sage)
        zmets.append(szmet)
        chisq = gr_chisq + ri_chisq + iz_chisq
        probability = 1 - chi2.cdf(chisq, 3 - 1)  # probability of chisq greater than this
        weight.append(probability)
    spIndex = np.argmax(weight)
    rest_gr = np.array(rest_gr)
    rest_gi = np.array(rest_gi)
    kii = np.array(kii)
    kri = np.array(kri)
    masslight = np.array(masslight)
    sfrs = np.array(sfrs)
    ages = np.array(ages)
    weight = np.array(weight)
    gr_weighted = rest_gr * weight
    gi_weighted = rest_gi * weight
    kii_weighted = kii * weight
    kri_weighted = kri * weight
    masslight_weighted = masslight * weight
    sfr_weighted = sfrs * weight
    age_weighted = ages * weight
    zmet_weighted = zmets * weight
    w1 = weight.sum()
    w2 = (weight**2).sum()
    if w1 == 0:
        w1 = 1e-10
    if w2 == 0:
        w2 = 1e-10
    mean_gr = gr_weighted.sum() / w1
    mean_gi = gi_weighted.sum() / w1
    mean_kii = kii_weighted.sum() / w1
    mean_kri = kri_weighted.sum() / w1
    mean_masslight = masslight_weighted.sum() / w1
    mean_sfr = sfr_weighted.sum() / w1
    mean_age = age_weighted.sum() / w1
    mean_zmet = zmet_weighted.sum() / w1
    # unbiased weighted estimator of the sample variance
    w3 = w1**2 - w2
    if w3 == 0:
        w3 = 1e-10
    var_gr = (w1 / w3) * (weight * (rest_gr - mean_gr)**2).sum()
    var_gi = (w1 / w3) * (weight * (rest_gi - mean_gi)**2).sum()
    var_kii = (w1 / w3) * (weight * (kii - mean_kii)**2).sum()
    var_kri = (w1 / w3) * (weight * (kri - mean_kii)**2).sum()
    var_masslight = (w1 / w3) * (weight * (masslight - mean_masslight)**2).sum()
    var_sfr = (w1 / w3) * (weight * (sfrs - mean_sfr)**2).sum()
    var_age = (w1 / w3) * (weight * (ages - mean_age)**2).sum()
    var_zmet = (w1 / w3) * (weight * (zmets - mean_zmet)**2).sum()
    std_gr = var_gr**0.5
    std_gi = var_gi**0.5
    std_kii = var_kii**0.5
    std_kri = var_kri**0.5
    std_masslight = var_masslight**0.5
    std_sfr = var_sfr**0.5
    std_age = var_age**0.5
    std_zmet = var_zmet**0.5
    if std_gr > 99.99:
        std_gr = 99.99
    if std_gi > 99.99:
        std_gi = 99.99
    if std_kii > 99.99:
        std_kii = 99.99
    if std_kri > 99.99:
        std_kri = 99.99
    if std_sfr > 99.99:
        std_sfr = 99.99
    if std_age > 99.99:
        std_age = 99.99
    if std_masslight > 99.99:
        std_masslight = 99.99
    if std_zmet > 99.99:
        std_zmet = 99.99
    # Comment -distanceModulus out for fsps versions <2.5,
    # as their mags don't include distance modulus
    if zed in ldistDict:
        lumdist = ldistDict[zed]
    else:
        lumdist = cd.luminosity_distance(zed)  # in Mpc
        ldistDict[zed] = lumdist
    distanceModulus = 5 * np.log10(lumdist * 1e6 / 10.)
    iabs = i + mean_kii - distanceModulus
    rabs = i + mean_kri - distanceModulus
    taMass = taylorMass(mean_gi, iabs)
    mcMass = mcintoshMass(mean_gr, rabs)
    fsMass = fspsMass(mean_masslight, iabs)
    # JTA: to make purely distance modulus
    # iabs = i[galaxy] - distanceModulus
    # fsMass = gstarMass(iabs)
    # saving for output
    # perhaps: out_id[galaxy] = id[galaxy]
    return [id, zed, mean_gr, std_gr, mean_gi, std_gi, mean_kii, std_kii,
            mean_kri, std_kri, i, distanceModulus, iabs, rabs, mcMass, taMass,
            fsMass, std_masslight, spIndex, zmets[spIndex], mean_sfr, std_sfr,
            mean_age, std_age, mean_zmet, std_zmet]
def Doornik_Hansen_MVN_test(X, verbose=1):
    print("==========================================")
    print("Doornik-Hansen Multivariate Normality Test")
    print("------------------------------------------")
    n = X.shape[0]
    k = X.shape[1]
    print("the number of dimensions   :", k)
    print("the number of sample points:", n)
    print("------------------------------------------")

    # === x_avg =======================================
    x_avg = (1 / n) * np.sum(X, axis=0)
    x_avg = x_avg.reshape((k, 1))

    # === inverse of sample_covariance_matrix =========
    S = np.cov(X.T)
    # for numpy function np.cov(M),
    # each row of M represents a variable (dimension),
    # each column of M represents a single observation;
    # thus, transpose required
    S_inv = np.linalg.inv(S)  # inverse of S

    # === V and C = VSV (correlation matrix) ==========
    D = np.sqrt(np.diag(S))
    D = np.diagflat(D)
    V = np.linalg.inv(D)
    C = np.dot(np.dot(V, S), V)  # correlation matrix

    # === eigenvectors and eigenvalues of C
    eig_val, eig_vec = np.linalg.eig(C)
    L = np.diagflat(eig_val**-0.5)
    H = eig_vec.T  # columns are eigenvectors

    # === transformed X
    X_center = X - x_avg.T
    dot_prod = np.dot(X_center, V)
    dot_prod = np.dot(dot_prod, H)
    dot_prod = np.dot(dot_prod, L)
    X_transf = np.dot(dot_prod, H.T)

    # === skewness and kurtosis of transformed X
    skewn_list = []
    kurto_list = []
    for dimension in range(k):
        x = X_transf[:, dimension]
        skewn = skew(x)
        kurto = kurtosis(x, fisher=False)  # Pearson definition
        skewn_list.append(skewn)
        kurto_list.append(kurto)

    # === z1 and z2 ==================================
    z1_list = []
    z2_list = []
    for p in range(k):
        b1 = (skewn_list[p])**2
        b2 = kurto_list[p]
        beta = (3 * (n**2 + 27 * n - 70) * (n + 1) * (n + 3)) / \
               ((n - 2) * (n + 5) * (n + 7) * (n + 9))
        ohm_2 = -1 + (2 * (beta - 1))**0.5
        delta1 = (np.log10(ohm_2**0.5))**(-0.5)
        y = ((b1 * (ohm_2 - 1) * (n + 1) * (n + 3)) / (12 * (n - 2)))**(0.5)
        z1 = delta1 * (np.log10(y + (1 + y**2)**0.5))
        z1_list.append(z1)
        # -------------------------------------------------------------
        delta2 = (n - 3) * (n + 1) * (n**2 + 15 * n - 4)
        a = ((n - 2) * (n + 5) * (n + 7) * (n**2 + 27 * n - 70)) / (6 * delta2)
        c = ((n - 7) * (n + 5) * (n + 7) * (n**2 + 2 * n - 5)) / (6 * delta2)
        f = ((n + 5) * (n + 7) * (n**3 + 37 * (n**2) + 11 * n - 313)) / (12 * delta2)
        alpha = a + b1 * c
        chi = 2 * f * (b2 - 1 - b1)
        z2 = ((9 * alpha)**0.5) * ((chi / (2 * alpha))**(1 / 3) - 1 + (1 / (9 * alpha)))
        z2_list.append(z2)

    Z1 = np.array(z1_list).reshape(k, 1)  # vertical vector
    Z2 = np.array(z2_list).reshape(k, 1)  # vertical vector

    # === chi2 prob ==================================
    # the Doornik-Hansen statistic is E = Z1'Z1 + Z2'Z2 ~ chi2(2k)
    statistic = (np.dot(Z1.T, Z1) + np.dot(Z2.T, Z2))  # [0][0]
    chi2_df = 2 * k
    p_chi2 = 1 - chi2.cdf(statistic, chi2_df)
    print("statistic: %.4f" % statistic)
    print("p-val    : %.4f" % p_chi2)
    print("------------------------------------------")
    print("If p-val < alpha (0.05), reject H0.")
    print("Note that H0 is multi-normality;")
    print("a larger p-val indicates multi-normality")
    print("==========================================")
    return None
def run_site_branch(cluster_name, treefile, alignment, folder_temp, folder_plots):
    from ete2 import EvolTree
    from ete2.treeview.layouts import evol_clean_layout
    import os
    from collections import defaultdict
    import math
    from scipy.stats import chi2

    print "Processing cluster: " + cluster_name
    tree = EvolTree(treefile)
    tree.link_to_alignment(alignment, alg_format="fasta", nucleotides=True)

    # Create temporal folder
    temp_cluster_folder = folder_temp + "/" + cluster_name
    if not os.path.exists(temp_cluster_folder):
        os.makedirs(temp_cluster_folder)
    tree.workdir = temp_cluster_folder

    # Run M0 as the null model
    tree.run_model("M0")

    # Look at the site selection on each branch
    printed_tree = 0
    i = 0
    # Output list with the results
    output_list = []
    for node in tree.iter_descendants():
        # Mark the tree for the leaf under analysis
        tree.mark_tree([node.node_id], marks=["#1"])
        # Use the node id as folder name
        temp_leaf_name = str(node.node_id)
        print "Processing: " + cluster_name + " " + temp_leaf_name + " " + ",".join(node.get_leaf_names())
        # Run computation of each model.
        # From the notes on ETE:
        #   to organize a bit, we name models with the name of the marked node;
        #   any character after the dot, in model name, is not taken into account
        #   for computation. (have a look in /tmp/ete2.../bsA.. directory)
        tree.run_model("bsA." + temp_leaf_name)
        tree.run_model("bsA1." + temp_leaf_name)
        bsA = tree.get_evol_model("bsA." + temp_leaf_name)
        bsA1 = tree.get_evol_model("bsA1." + temp_leaf_name)
        ps_sites = defaultdict()
        total_sites = 0
        sites_over_95 = 0
        for s in range(len(bsA.sites['BEB']['aa'])):
            p_value_site = float(bsA.sites['BEB']['p2'][s])
            if p_value_site > 0.50:
                ps_sites[s] = [bsA.sites['BEB']['aa'][s], bsA.sites['BEB']['p2'][s]]
                total_sites += 1
                if p_value_site > 0.95:
                    sites_over_95 += 1
        # ps = float(tree.get_most_likely("bsA." + temp_leaf_name, "bsA1." + temp_leaf_name))
        rx = float(tree.get_most_likely("bsA1." + temp_leaf_name, "M0"))
        lrt_value = 2 * math.fabs(bsA1.lnL - bsA.lnL)  # LRT test value
        ps = 1 - chi2.cdf(lrt_value, 1)                # p-value based on chi-square
        test_status = None
        # Evidence of positive selection in the branch
        omega_value = float(bsA.classes['foreground w'][2])
        proportion_sites = float(bsA.classes['proportions'][2])
        # Plot file
        plot_file = folder_plots + "/" + cluster_name
        if ps < 0.05 and omega_value > 1:
            # Save plots, both in jpg and svg, of the clusters with evidence
            # of positive selection
            test_status = "Positive"
            if printed_tree == 0:
                # tree.render(plot_file + ".svg", layout=evol_clean_layout)
                # tree.render(plot_file + ".jpg", layout=evol_clean_layout)
                printed_tree = 1
            else:
                continue
        elif rx < 0.05 and ps >= 0.05:
            test_status = "Relaxed"
        else:
            # print "no signal"
            test_status = None
        # Remove marks on the tree
        tree.mark_tree(map(lambda x: x.node_id, tree.get_descendants()),
                       marks=[''] * len(tree.get_descendants()), verbose=False)
        result_entry = [cluster_name, node.node_id, omega_value, proportion_sites,
                        ps, test_status, total_sites, sites_over_95,
                        ",".join(node.get_leaf_names())]
        # print result_entry
        # print ps_sites
        # node_results[node.node_id] = [result_entry, ps_sites]
        node_result = [result_entry, ps_sites]
        output_list.append(node_result)
    return output_list
def makePrunedSubtrees(remainingAttributes, examples, attributeValues, className,
                       defaultLabel, setScoreFunc, gainFunc, q):
    """
    Creates a classification tree Node and all its children. This returns a Node,
    which is the root Node of the tree constructed from the passed in parameters.
    This should be implemented recursively, and handle base cases for zero
    examples or remainingAttributes as covered in the book.

    Args:
        remainingAttributes (list<string>): the names of attributes still not used
        examples (list<dictionary<str,str>>): list of examples
        attrValues (dictionary<string,list<string>>): list of possible values for attribute
        className (str): the name of the class
        defaultLabel (string): the default label
        setScoreFunc (func): the function to score classes (ie classEntropy or gini)
        gainFunc (func): the function to score gain of attributes (ie entropyGain or giniGain)
        q (float): the Chi-Squared pruning parameter
    Returns:
        Node or LeafNode
        The classification tree node optimal for the remaining set of attributes.
    """
    # YOUR CODE HERE (Extra Credit)
    if len(examples) == 0:
        return LeafNode(defaultLabel)
    else:
        matching = True
        currentClassification = examples[0][className]
        for dictionary in examples:
            if dictionary[className] != currentClassification:
                matching = False
        if matching:
            return LeafNode(currentClassification)
        if len(remainingAttributes) == 0:
            classCounts = getClassCounts(examples, className)
            currentMax = classCounts[classCounts.keys()[0]]
            currentFeature = classCounts.keys()[0]
            for key in classCounts.keys():
                if classCounts[key] > currentMax:
                    currentMax = classCounts[key]
                    currentFeature = key
            return LeafNode(currentFeature)
        else:
            maxScore = float(gainFunc(examples, remainingAttributes[0],
                                      attributeValues[remainingAttributes[0]], className))
            bestAttribute = remainingAttributes[0]
            for attribute in remainingAttributes:
                currentScore = float(gainFunc(examples, attribute,
                                              attributeValues[attribute], className))
                if abs(currentScore) > abs(maxScore):
                    bestAttribute = attribute
                    maxScore = currentScore
            node = Node(bestAttribute)
            # a list of all attribute values we're splitting on
            bestAttributeValues = attributeValues[bestAttribute]
            # make list of class values
            classificationValues = []
            for dictionary in examples:
                for key in dictionary.keys():
                    if key == className and not (dictionary[key] in classificationValues):
                        classificationValues.append(dictionary[key])
            classCountsForExamples = getClassCounts(examples, className)
            # chi statistic
            sum = 0.0
            # go through splitValues
            for splitValue in bestAttributeValues:
                # split on values
                subset = getPertinentExamples(examples, bestAttribute, splitValue)
                classCountsForSubset = getClassCounts(subset, className)
                # go through all class values
                for classValue in classificationValues:
                    if classValue in classCountsForExamples.keys():
                        numberOfValueInExamples = classCountsForExamples[classValue]
                        # print "number in example ", numberOfValueInExamples
                        probabilityOfValueInExamples = float(numberOfValueInExamples) / float(len(examples))
                    else:
                        probabilityOfValueInExamples = 0.0
                    if classValue in classCountsForSubset.keys():
                        numberOfValueInSplit = classCountsForSubset[classValue]
                        # print "number in split ", numberOfValueInSplit
                        probabilityOfValueInSplit = float(numberOfValueInSplit)
                    else:
                        probabilityOfValueInSplit = 0.0
                    p_hat = probabilityOfValueInExamples * float(len(subset))
                    if p_hat != 0.0:
                        currentSum = float(((abs(p_hat - probabilityOfValueInSplit))**2) / p_hat)
                        sum += currentSum
            # this is v
            degreesOfFreedom = (len(bestAttributeValues) - 1)  # * (len(classificationValues) - 1)
            p_value_chi = 1 - chi2.cdf(sum, degreesOfFreedom)
            # print p_value_chi
            # print sum
            # p_value_chi = chisqprob(sum, degreesOfFreedom)
            # print p_value_chi
            # print q
            if p_value_chi > q:  # (0.0028893):
                # print "pruning"
                classCounts = getClassCounts(examples, className)
                # print classCounts
                currentMax = classCounts[classCounts.keys()[0]]
                currentFeature = classCounts.keys()[0]
                for key in classCounts.keys():
                    if classCounts[key] > currentMax:
                        currentMax = classCounts[key]
                        currentFeature = key
                return LeafNode(currentFeature)
            else:
                newRemainingAttributes = []
                # remove bestAttribute from list
                for attribute in remainingAttributes:
                    if attribute != bestAttribute:
                        newRemainingAttributes.append(attribute)
                for valueOfBest in attributeValues[bestAttribute]:
                    newExamples = getPertinentExamples(examples, bestAttribute, valueOfBest)
                    classCounts = getClassCounts(examples, className)
                    currentMax = classCounts[classCounts.keys()[0]]
                    currentFeature = classCounts.keys()[0]
                    for key in classCounts.keys():
                        if classCounts[key] > currentMax:
                            currentMax = classCounts[key]
                            currentFeature = key
                    node.children[valueOfBest] = makePrunedSubtrees(
                        newRemainingAttributes, newExamples, attributeValues,
                        className, currentFeature, setScoreFunc, gainFunc, q)
                return node
def _mcnemar_p_value(self):
    p = 1 - chi2.cdf(self.mcnemar_x2_statistic, 1)
    return p
def CDF_trial(N, x):
    """Calculated theoretical expectation for CDF"""
    return np.array(np.power(chi2.cdf(np.array(x), 4), N))
# (https://markusthill.github.io/mahalanbis-chi-squared/#the-squared-mahalanobis-distance-follows-a-chi-square-distribution-more-formal-derivation)
#
# Given a cutoff value associated with the statistical significance
# with which we want to determine outliers, we obtain the corresponding
# threshold value above which to consider an observation an outlier
cutoff = 0.98
degrees_of_freedom = df_transf.shape[1]  # given by the number of variables (columns)
cut = chi2.ppf(cutoff, degrees_of_freedom)  # threshold value

# Squared Mahalanobis distance values of outliers
D[D > cut]

# %%
# Calculate the probability that the distance D[5] is an outlier
chi2.cdf(D[5], degrees_of_freedom)

# %%
# Calculate if the observation is an outlier given the cutoff
is_outlier_arr = (D > cut)
# Calculate the probability that an observation is an outlier not by chance
outliers_stat_proba = np.zeros(len(is_outlier_arr))
for i in range(len(is_outlier_arr)):
    outliers_stat_proba[i] = chi2.cdf(D[i], degrees_of_freedom)
# How many outliers with statistical significance greater than the cutoff
len(outliers_stat_proba[outliers_stat_proba > cutoff])
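# A self-contained sketch of the Mahalanobis/chi-square outlier recipe above
# (added for illustration; df_transf and D are assumed to come from earlier
# cells): squared Mahalanobis distances of multivariate-normal data follow a
# chi-square distribution with dof equal to the number of variables.
import numpy as np
from scipy.stats import chi2

rng = np.random.default_rng(0)
X = rng.multivariate_normal([0, 0, 0], np.eye(3), size=500)
mu = X.mean(axis=0)
cov_inv = np.linalg.inv(np.cov(X.T))
diff = X - mu
D = np.einsum('ij,jk,ik->i', diff, cov_inv, diff)  # squared distances
cut = chi2.ppf(0.98, X.shape[1])
print((D > cut).sum())  # expect roughly 2% of 500 points, i.e. ~10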
def prob(N, max2F):
    """Works out the probability given a number of templates and a maximum twoF"""
    littleP = 1 - chi2.cdf(max2F, 4)
    return N * littleP * pow(chi2.cdf(max2F, 4), N)
def _chi2_test(self):
    degrees_of_freedom = (self.data.shape - np.array((1, 1))).prod()
    pf = lambda x: 1. - chi2.cdf(x, degrees_of_freedom)
    self.chi2Val = self.inertia * (self.data.sum())
    self.p = pf(self.chi2Val)
# Make 1200 guesses along the sensible domain of N
# TODO use optimisation tools/package or Newton-Raphson instead
Nvector = np.linspace(0.5 * Nmean, Nmean, num=1200)
ksPlot = [ksDist(NIdx, CDF_empir, CDF_binVals) for NIdx in Nvector]
Neff = Nvector[np.where(ksPlot == np.min(ksPlot))[0][0]]
#ksMin = min([ksDist(NIdx, CDF_empir, CDF_binVals) for NIdx in np.linspace(0, Ntot, num=50)])

############################################################
# 5) Find further-look threshold by evaluating where
#    [ CDF(Chi2(Neff,4)) == \alpha ] for confidence level \alpha
############################################################

# Simple theoretical probability the overall max2F came from Gaussian noise
P2Fmax = 1 - chi2.cdf(max2F, 4)

def prob(N, max2F):
    """Works out the probability given a number of templates and a maximum twoF"""
    littleP = 1 - chi2.cdf(max2F, 4)
    return N * littleP * pow(chi2.cdf(max2F, 4), N)

Pval = prob(Neff * Ntot / Nmean, max2F)

############################################################
# Find x, where p(x) is first expected to be > 95%
############################################################
output.write('\n')

# I think this is not motivated
# logL_lomax = -N/2 * (np.log(2*np.pi) + 1 - np.log(N) + np.log(ssq_lomax))
# vs lomax
# LRT = -2*logL_exp1 + 2*logL_lomax
# p = 1 - chi2.cdf(LRT, np_lomax - np_exp1)

# vs 2-exp
if dont_do == False:
    LRT = -2*logL_exp1 + 2*logL_exp2
    p = 1 - chi2.cdf(LRT, np_exp2 - np_exp1)

    # F-ratio
    # np2 must be greater than np1,
    # i.e. model1 is the restricted and model2 the unrestricted nested model
    F_ratio = ((ssq_exp1 - ssq_exp2) / (np_exp2 - np_exp1)) / (ssq_exp2 / (N - np_exp2))
    p_F = 1 - f.cdf(F_ratio, np_exp2 - np_exp1, N - np_exp2)

    plt.suptitle('LRT: '+str(p)+'\nF-ratio: '+str(p_F))
    output.write('log-likelihood 1-term exponential: '+str(logL_exp1)+' | Tf='+str(T_1)+'\n')
    output.write('log-likelihood 2-term exponential: '+str(logL_exp2)+' | Tf='+', '.join([str(x) for x in T_2])+'\n')
    output.write('\n')
    output.write('LRT: '+str(LRT)+' | P-value: '+str(p)+'\n')
    output.write('F-ratio: '+str(F_ratio)+' | P-value: '+str(p_F)+' | Tf='+str(T_lomax)+'\n')
def hubreg(yx, Xx, c=None, sig0=None, b0=None, printitn=0, ITERMAX=2000, ERRORTOL=1e-5):
    # ensure that y is Nx1 and not just N and proper formats
    y = np.copy(np.asarray(yx))
    X = np.copy(np.asarray(Xx))
    y = y if not len(y.shape) == 2 else y.flatten()
    n, p = X.shape
    realdata = np.isrealobj(y)
    if c is None:
        c = 1.3415 if realdata else 1.215  # Default: approx 95% efficiency for Gaussian errors
    if b0 is None:
        b0 = np.linalg.lstsq(X[range(len(y)), :], y, rcond=None)[0]
    if sig0 is None:
        sig0 = np.linalg.norm(y - X@b0)/np.sqrt(n - p)
    csq = c**2
    if realdata:
        qn = chi2.cdf(csq, 1)
        alpha = chi2.cdf(csq, 3) + csq*(1 - qn)    # consistency factor for scale
    else:
        qn = chi2.cdf(2*csq, 2)
        alpha = chi2.cdf(2*csq, 4) + csq*(1 - qn)  # consistency factor for scale
    Z = np.linalg.pinv(X)[0]  # svd <1e-15 are set to zero
    con = np.sqrt((n - p)*alpha)
    i = 1
    while i <= ITERMAX:
        # Step 1: update residual
        r = y - X@b0[:, np.newaxis].flatten()
        psires = rsp.psihub(r/sig0, c)*sig0
        # Step 2: Update the scale
        sig1 = np.linalg.norm(psires)/con
        # Step 3: Update the pseudo-residual
        psires = rsp.psihub(r/sig1, c)*sig1
        # Step 4: regress X on the pseudo-residual
        update = Z@psires  # update should be vector not matrix
        # Step 6: Check convergence
        crit2 = np.linalg.norm(update)/np.linalg.norm(b0)
        # Step 5: update the Beta
        b0 += update
        if printitn > 0 and i % printitn == 0:
            print('hubreg: crit(%4d) = %.9f\n' % (i, crit2))
        if crit2 < ERRORTOL:
            break
        sig0 = sig1
        i += 1  # advance the iteration counter so the loop terminates
    if i == ITERMAX:
        print('error!!! MAXiter = %d crit2 = %.7f\n' % (i, crit2))
    return b0, sig1, i