def rsig_student(ndof_eff, alpha=0.95):
    """
    USAGE
    -----
    Rsigt = rsig_student(ndof_eff, alpha=0.95)

    Critical value of the Pearson correlation coefficient obtained from
    the Student t distribution with `ndof_eff` - 2 degrees of freedom.
    The critical t is the point whose upper-tail probability equals
    1 - `alpha` (i.e., a one-tailed threshold).

    References
    ----------
    https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient

    Example
    -------
    TODO
    """
    dof = ndof_eff - 2

    ## Upper-tail probability left outside the confidence level; the
    ## inverse survival function turns it into the critical t statistic.
    tail_prob = 1 - alpha
    t_threshold = student.isf(tail_prob, dof)

    ## Map the critical t statistic onto the equivalent critical r.
    return t_threshold/np.sqrt(dof + t_threshold**2)
def run(self, seqGroup1, seqGroup2, parentSeqGroup1, parentSeqGroup2, confIntervMethod, coverage):
    """Compare two groups of per-sample proportions with Welch's t-test.

    Each sample's proportion is its count divided by the parent count at
    the same position. The groups are compared without assuming equal
    variances (Welch-Satterthwaite degrees of freedom).

    Parameters:
      seqGroup1, seqGroup2: per-sample counts for each group.
      parentSeqGroup1, parentSeqGroup2: per-sample parent counts; a parent
        count that is not positive yields a proportion of 0.0 and a note.
      confIntervMethod: unused here; kept for interface compatibility.
      coverage: confidence-interval coverage as a fraction (e.g. 0.95).

    Returns a tuple:
      (one-sided p-value, two-sided p-value, lower CI, upper CI,
       difference in mean proportions, note)
    where the CI bounds and the difference are expressed as percentages
    and note is '' unless a degenerate case was encountered.
    """
    def toProportions(counts, parentCounts):
        # Index-based lookup keeps the requirement that every count has a
        # parent count at the same position (IndexError otherwise).
        props = []
        emptyParent = False
        for i, count in enumerate(counts):
            parent = parentCounts[i]
            if parent > 0:
                props.append(float(count) / parent)
            else:
                props.append(0.0)
                emptyParent = True
        return props, emptyParent

    n1 = len(seqGroup1)
    n2 = len(seqGroup2)
    if n1 < 2 or n2 < 2:
        # A variance cannot be estimated from fewer than two samples.
        return 0.5, 1.0, 0.0, 0.0, 0.0, 'degenerate case: both groups must contain at least 2 samples'

    note = ''
    propGroup1, emptyParent1 = toProportions(seqGroup1, parentSeqGroup1)
    propGroup2, emptyParent2 = toProportions(seqGroup2, parentSeqGroup2)
    if emptyParent1 or emptyParent2:
        note = 'degenerate case: parent group had a count of zero'

    meanG1 = float(sum(propGroup1)) / n1
    meanG2 = float(sum(propGroup2)) / n2
    dp = meanG1 - meanG2

    # variance(values, mean) is a helper defined elsewhere in the project;
    # presumably the sample variance -- TODO confirm.
    normVarG1 = variance(propGroup1, meanG1) / n1
    normVarG2 = variance(propGroup2, meanG2) / n2
    unpooledVar = normVarG1 + normVarG2

    if unpooledVar != 0:
        sqrtUnpooledVar = math.sqrt(unpooledVar)

        # p-value
        T_statistic = dp / sqrtUnpooledVar
        dof = (unpooledVar*unpooledVar) / ((normVarG1*normVarG1)/(n1-1) + (normVarG2*normVarG2)/(n2-1))
        pValue = t.cdf(T_statistic, dof)

        # CI: the 0.5 factor splits the excluded probability between the
        # two tails of the symmetric t distribution.
        tCritical = t.isf(0.5 * (1.0-coverage), dof)
        lowerCI = dp - tCritical*sqrtUnpooledVar
        upperCI = dp + tCritical*sqrtUnpooledVar
    else:
        # No variance in either group: any difference between the means is
        # taken at face value and the interval collapses onto dp.
        pValue = 0.0 if meanG1 != meanG2 else 0.5
        lowerCI = dp
        upperCI = dp
        note = 'degenerate case: variance of both groups is zero'

    return 1.0 - pValue, 2*min(pValue, 1.0 - pValue), lowerCI*100, upperCI*100, dp*100, note
def run(self, seqGroup1, seqGroup2, parentSeqGroup1, parentSeqGroup2, confIntervMethod, coverage):
    """Compare two groups of per-sample proportions with an equal-variance t-test.

    Each sample's proportion is its count divided by the parent count at
    the same position. The groups are compared with a pooled variance and
    n1 + n2 - 2 degrees of freedom.

    Parameters:
      seqGroup1, seqGroup2: per-sample counts for each group.
      parentSeqGroup1, parentSeqGroup2: per-sample parent counts; a parent
        count that is not positive yields a proportion of 0.0 and a note.
      confIntervMethod: unused here; kept for interface compatibility.
      coverage: confidence-interval coverage as a fraction (e.g. 0.95).

    Returns a tuple:
      (one-sided p-value, two-sided p-value, lower CI, upper CI,
       difference in mean proportions, note)
    where the CI bounds and the difference are expressed as percentages
    and note is a string ('' unless a degenerate case was encountered).
    """
    def toProportions(counts, parentCounts):
        # Index-based lookup keeps the requirement that every count has a
        # parent count at the same position (IndexError otherwise).
        props = []
        emptyParent = False
        for i, count in enumerate(counts):
            parent = parentCounts[i]
            if parent > 0:
                props.append(float(count) / parent)
            else:
                props.append(0.0)
                emptyParent = True
        return props, emptyParent

    n1 = len(seqGroup1)
    n2 = len(seqGroup2)
    if n1 < 2 or n2 < 2:
        # A variance cannot be estimated from fewer than two samples.
        return 0.5, 1.0, 0.0, 0.0, 0.0, 'degenerate case: both groups must contain at least 2 samples'

    note = ''
    propGroup1, emptyParent1 = toProportions(seqGroup1, parentSeqGroup1)
    propGroup2, emptyParent2 = toProportions(seqGroup2, parentSeqGroup2)
    if emptyParent1 or emptyParent2:
        note = 'degenerate case: parent group had a count of zero'

    # calculate statistics
    meanG1 = float(sum(propGroup1)) / n1
    meanG2 = float(sum(propGroup2)) / n2
    dp = meanG1 - meanG2

    # variance(values, mean) is a helper defined elsewhere in the project;
    # presumably the sample variance -- TODO confirm.
    varG1 = variance(propGroup1, meanG1)
    varG2 = variance(propGroup2, meanG2)

    dof = n1 + n2 - 2
    pooledVar = ((n1 - 1)*varG1 + (n2 - 1)*varG2) / dof
    denom = math.sqrt(pooledVar) * math.sqrt(1.0/n1 + 1.0/n2)

    # The zero-variance case is tested explicitly rather than by catching
    # ZeroDivisionError: a handler placed after a generic `except Exception`
    # never runs, and numpy floats do not raise on division by zero.
    if denom == 0:
        # No variance in either group: any difference between the means is
        # taken at face value and the interval collapses onto dp.
        pValue = 0.0 if meanG1 != meanG2 else 0.5
        lowerCI = dp
        upperCI = dp
        note = 'degenerate case: variance of both groups is zero'
    else:
        # p-value
        T_statistic = dp / denom
        pValue = t.cdf(T_statistic, dof)

        # CI: the 0.5 factor splits the excluded probability between the
        # two tails of the symmetric t distribution.
        tCritical = t.isf(0.5 * (1.0-coverage), dof)
        lowerCI = dp - tCritical*denom
        upperCI = dp + tCritical*denom

    return 1.0 - pValue, 2*min(pValue, 1.0 - pValue), lowerCI*100, upperCI*100, dp*100, note
def weighted_average(w_in,y_in,conf=None,do_std=True):
    """Weighted average (and optionally weighted spread) over the record axis

    input
    =====
    w_in: 2-D numpy array of weights, shape (nrec,x).  The weights are
          normalized so that they sum to one over the record axis.

    y_in: numpy array, or list of numpy arrays, to average.  Each array
          must have nrec entries along its first axis and x entries along
          its last axis; the weights are broadcast over any axes between.

    conf: confidence level (e.g., conf=0.95).  If given, each spread is
          multiplied by the two-sided Student t critical value with
          nrec-1 degrees of freedom.  If None (default), the weighted
          standard deviation is returned unscaled.

    do_std: Bool.  If False, no spread is computed.  Default=True.

    output
    ======
    WA: list with one weighted average per input array (a list is
        returned even when y_in is a single array).
        WA[i].shape=y_in[i].shape[1:]

    WSTD: list of weighted standard deviations matching WA, or None when
          do_std is False or there are fewer than three records.

    notes
    =====
    The variance is sum(w*(y-WA)**2)/(1-sum(w**2)).  Variances that are
    not positive or not finite are replaced by zero before the root.
    """
    assert type(w_in) is np.ndarray
    assert w_in.ndim==2

    arrays = y_in if type(y_in) is list else [y_in]
    for arr in arrays:
        assert arr.shape[0]==w_in.shape[0]
        assert arr.shape[-1]==w_in.shape[-1]
        assert type(arr) is np.ndarray

    # normalized weights
    w = w_in/np.sum(w_in,axis=0)
    nrec = w.shape[0]

    want_std = do_std and nrec>2
    averages = []
    spreads = None
    if want_std:
        sum_w2 = np.sum(w*w,axis=0)
        spreads = []

    for arr in arrays:
        # insert length-one axes so the weights line up with (rec,...,x)
        bshape = [nrec]+[1]*(arr.ndim-2)+[w.shape[-1]]
        w = w.reshape(bshape)

        mean = np.sum(w*arr,axis=0)
        averages.append(mean)

        if want_std:
            sum_w2 = sum_w2.reshape(bshape[1:])
            dev = arr-mean
            variance = np.sum(w*dev*dev,axis=0)
            variance = variance/(1.-sum_w2)

            # zero anything that cannot be square-rooted sensibly
            usable = (variance>0) & (np.isfinite(variance))
            variance[~usable] = 0.
            spreads.append(np.sqrt(variance))

    # scale to a confidence interval if requested
    if conf is not None and spreads is not None:
        t_factor = _ST.isf((1.-conf)*0.5,nrec-1)
        for spread in spreads:
            spread *= t_factor   # in place, so the list entry is updated

    return averages,spreads
def spliced_ave_var(w_in,y_in,conf=None):
    """Splice per-record averages/variances into one weighted average/variance

    input
    =====
    w_in: 2-D numpy array of weights, shape (nrec,x).  The weights are
          normalized so that they sum to one over the record axis.

    y_in: numpy array, or list of numpy arrays, each of shape
          (nrec,val,...,x) with exactly two entries along val
          val=0 => average
          val=1 => variance

    conf: confidence level (e.g., conf=0.95).  If given (and nrec>1) the
          between-record spread is multiplied by the two-sided Student t
          critical value with nrec-1 degrees of freedom divided by
          sqrt(nrec).  If None (default), the spread is returned unscaled.

    output
    ======
    Y: list with one array per input array (a list is returned even when
       y_in is a single array).  Y[i].shape=y_in[i].shape[1:]
       Y[i][0,...] is the spliced average, Y[i][1,...] the spliced variance

    WSTD: list matching Y with the between-record spread of the averages
          (index 0) and of the variances (index 1).  All zeros when there
          is a single record.

    notes
    =====
    Records are folded in one at a time with a running weighted update,
    so the statement order inside the accumulation loop matters.
    Spreads that are not positive or not finite are replaced by zero.
    """
    assert type(w_in) is np.ndarray
    assert w_in.ndim==2

    arrays = y_in if type(y_in) is list else [y_in]
    for arr in arrays:
        assert arr.ndim>2
        assert arr.shape[0]==w_in.shape[0]
        assert arr.shape[1]==2
        assert arr.shape[-1]==w_in.shape[-1]
        assert type(arr) is np.ndarray

    nrec = w_in.shape[0]

    # normalized weights
    weight = w_in/np.sum(w_in,axis=0)

    sum_w = np.zeros(weight.shape[1:])    # running sum of weights
    sum_w2 = np.zeros(weight.shape[1:])   # running sum of squared weights

    spliced = [np.zeros(arr.shape[1:]) for arr in arrays]    # spliced mean,variance
    spread = [np.zeros(arr.shape[1:]) for arr in arrays]     # spread of mean,variance
    var_mean = [np.zeros(arr.shape[2:]) for arr in arrays]   # running mean of variance (not returned)
    # shape that broadcasts an (x,) weight against y less rec,val
    bshapes = [[1]*(arr.ndim-3)+[arr.shape[-1]] for arr in arrays]

    # accumulate
    for rec in range(nrec):
        w = weight[rec,:]
        prev_sum_w = sum_w.copy()   # copy: sum_w is updated in place below
        sum_w += w
        sum_w2 += w*w

        for arr,tot,spr,vbar,shp in zip(arrays,spliced,spread,var_mean,bshapes):
            x = arr[rec,0,...]
            v = arr[rec,1,...]

            f0 = (w/sum_w).reshape(shp)
            f1 = (prev_sum_w*w/sum_w).reshape(shp)

            # deviation from the running mean *before* it is updated
            dx = x-tot[0,...]
            dx2 = dx*dx

            # splice mean/var
            tot[0,...] += dx*f0
            tot[1,...] += v*w+dx2*f1

            # spread of the means
            spr[0,...] += dx2*f1

            # spread of the variances
            dv = v-vbar
            vbar += dv*f0
            spr[1,...] += dv*dv*f1

    # normalize spliced variance
    for tot,shp in zip(spliced,bshapes):
        tot[1,...] /= sum_w.reshape(shp)

    # normalize spread of mean, spread of variance
    if nrec<2:
        fac = np.zeros(sum_w.shape)
    else:
        fac = sum_w/(sum_w**2-sum_w2)

    for k,shp in enumerate(bshapes):
        spread[k][0,...] *= fac.reshape(shp)
        spread[k][1,...] *= fac.reshape(shp)

        if nrec>1:
            # zero anything that cannot be square-rooted sensibly
            usable = (spread[k]>0)&(np.isfinite(spread[k]))
            spread[k][~usable] = 0.
            spread[k] = np.sqrt(spread[k])
        else:
            spread[k] = np.zeros(spread[k].shape)

    # confidence interval
    if conf is not None and nrec>1:
        t_factor = _ST.isf((1.-conf)*0.5,nrec-1)/np.sqrt(nrec)
        for spr in spread:
            spr *= t_factor   # in place, so the list entry is updated

    return spliced,spread
def run(self, seqGroup1, seqGroup2, parentSeqGroup1, parentSeqGroup2, confIntervMethod, coverage):
    """Compare two groups of per-sample proportions with an equal-variance t-test.

    Each sample's proportion is its count divided by the parent count at
    the same position. The groups are compared with a pooled variance and
    n1 + n2 - 2 degrees of freedom.

    Parameters:
      seqGroup1, seqGroup2: per-sample counts for each group.
      parentSeqGroup1, parentSeqGroup2: per-sample parent counts; a parent
        count that is not positive yields a proportion of 0.0 and a note.
      confIntervMethod: unused here; kept for interface compatibility.
      coverage: confidence-interval coverage as a fraction (e.g. 0.95).

    Returns a tuple:
      (one-sided p-value, two-sided p-value, lower CI, upper CI,
       difference in mean proportions, note)
    where the CI bounds and the difference are expressed as percentages
    and note is a string ('' unless a degenerate case was encountered).
    """
    def toProportions(counts, parentCounts):
        # Index-based lookup keeps the requirement that every count has a
        # parent count at the same position (IndexError otherwise).
        props = []
        emptyParent = False
        for i, count in enumerate(counts):
            parent = parentCounts[i]
            if parent > 0:
                props.append(float(count) / parent)
            else:
                props.append(0.0)
                emptyParent = True
        return props, emptyParent

    n1 = len(seqGroup1)
    n2 = len(seqGroup2)
    if n1 < 2 or n2 < 2:
        # A variance cannot be estimated from fewer than two samples.
        return 0.5, 1.0, 0.0, 0.0, 0.0, 'degenerate case: both groups must contain at least 2 samples'

    # calculate proportions
    note = ''
    propGroup1, emptyParent1 = toProportions(seqGroup1, parentSeqGroup1)
    propGroup2, emptyParent2 = toProportions(seqGroup2, parentSeqGroup2)
    if emptyParent1 or emptyParent2:
        note = 'degenerate case: parent group had a count of zero'

    # calculate statistics
    meanG1 = float(sum(propGroup1)) / n1
    meanG2 = float(sum(propGroup2)) / n2
    dp = meanG1 - meanG2

    # variance(values, mean) is a helper defined elsewhere in the project;
    # presumably the sample variance -- TODO confirm.
    varG1 = variance(propGroup1, meanG1)
    varG2 = variance(propGroup2, meanG2)

    dof = n1 + n2 - 2
    pooledVar = ((n1 - 1)*varG1 + (n2 - 1)*varG2) / dof
    sqrtPooledVar = math.sqrt(pooledVar)
    denom = sqrtPooledVar * math.sqrt(1.0/n1 + 1.0/n2)

    if denom != 0:
        # p-value
        T_statistic = dp / denom
        pValue = t.cdf(T_statistic, dof)

        # CI: the 0.5 factor splits the excluded probability between the
        # two tails of the symmetric t distribution.
        tCritical = t.isf(0.5 * (1.0-coverage), dof)
        lowerCI = dp - tCritical*denom
        upperCI = dp + tCritical*denom
    else:
        # Tested explicitly instead of catching ZeroDivisionError: a
        # handler listed after a generic `except Exception` never runs,
        # and numpy floats do not raise on division by zero.
        # With no variance in either group, any difference between the
        # means is taken at face value and the interval collapses onto dp.
        if meanG1 != meanG2:
            pValue = 0.0
        else:
            pValue = 0.5
        lowerCI = dp
        upperCI = dp
        note = 'degenerate case: variance of both groups is zero'

    return 1.0 - pValue, 2*min(pValue, 1.0 - pValue), lowerCI*100, upperCI*100, dp*100, note
def run(self, seqGroup1, seqGroup2, parentSeqGroup1, parentSeqGroup2, confIntervMethod, coverage):
    """Compare two groups of per-sample proportions with Welch's t-test.

    Each sample's proportion is its count divided by the parent count at
    the same position. The groups are compared without assuming equal
    variances (Welch-Satterthwaite degrees of freedom), using variances
    computed with ddof=1.

    Parameters:
      seqGroup1, seqGroup2: per-sample counts for each group.
      parentSeqGroup1, parentSeqGroup2: per-sample parent counts; a parent
        count that is not positive yields a proportion of 0.0 and a note.
      confIntervMethod: unused here; kept for interface compatibility.
      coverage: confidence-interval coverage as a fraction (e.g. 0.95).

    Returns a tuple:
      (one-sided p-value, two-sided p-value, lower CI, upper CI,
       difference in mean proportions, note)
    where the CI bounds and the difference are expressed as percentages
    and note is '' unless a degenerate case was encountered.
    """
    def toProportions(counts, parentCounts):
        # Index-based lookup keeps the requirement that every count has a
        # parent count at the same position (IndexError otherwise).
        props = []
        emptyParent = False
        for i, count in enumerate(counts):
            parent = parentCounts[i]
            if parent > 0:
                props.append(float(count) / parent)
            else:
                props.append(0.0)
                emptyParent = True
        return props, emptyParent

    n1 = len(seqGroup1)
    n2 = len(seqGroup2)
    if n1 < 2 or n2 < 2:
        # A variance cannot be estimated from fewer than two samples.
        return 0.5, 1.0, 0.0, 0.0, 0.0, 'degenerate case: both groups must contain at least 2 samples'

    # calculate proportions
    note = ''
    propGroup1, emptyParent1 = toProportions(seqGroup1, parentSeqGroup1)
    propGroup2, emptyParent2 = toProportions(seqGroup2, parentSeqGroup2)
    if emptyParent1 or emptyParent2:
        note = 'degenerate case: parent group had a count of zero'

    # calculate p-value, effect size, and CI
    meanG1 = float(sum(propGroup1)) / n1
    meanG2 = float(sum(propGroup2)) / n2
    dp = meanG1 - meanG2

    # var is presumably numpy.var (it takes ddof) -- verify against the
    # file's imports.
    normVarG1 = var(propGroup1, ddof=1) / n1
    normVarG2 = var(propGroup2, ddof=1) / n2
    unpooledVar = normVarG1 + normVarG2
    sqrtUnpooledVar = math.sqrt(unpooledVar)

    if unpooledVar != 0:
        # p-value
        T_statistic = dp / sqrtUnpooledVar
        dof = (unpooledVar*unpooledVar) / ((normVarG1*normVarG1)/(n1-1) + (normVarG2*normVarG2)/(n2-1))
        pValue = t.cdf(T_statistic, dof)

        # CI: the 0.5 factor splits the excluded probability between the
        # two tails of the symmetric t distribution.
        tCritical = t.isf(0.5 * (1.0-coverage), dof)
        lowerCI = dp - tCritical*sqrtUnpooledVar
        upperCI = dp + tCritical*sqrtUnpooledVar
    else:
        # No variance in either group: any difference between the means is
        # taken at face value and the interval collapses onto dp.
        if meanG1 != meanG2:
            pValue = 0.0
        else:
            pValue = 0.5
        lowerCI = dp
        upperCI = dp
        note = 'degenerate case: variance of both groups is zero'

    return 1.0 - pValue, 2*min(pValue, 1.0 - pValue), lowerCI*100, upperCI*100, dp*100, note