def ttest_range(sample, left, right):
    k = len(sample) - 1
    left = stdtr(k, -t_statistic(sample, left))
    print('left:', left)
    right = stdtr(k, t_statistic(sample, right))
    print('right:', right)
    outside = left + right
    print('outside:', outside)
    inside = 1 - outside
    print('inside:', inside)
    return inside
def spearman_rs(l1, l2):
    """Compute Spearman-Rank Correlation Coefficient with corresponding p-Value"""
    if len(l1) == 0 or len(l2) == 0:
        print('ERROR: LISTS CONTAIN NO ELEMENTS!')
        return -1.
    elif len(l1) != len(l2):
        print('ERROR: LISTS HAVE TO HAVE THE SAME LENGTH!')
        return -1.
    l1 = rankdata(l1)
    l2 = rankdata(l2)
    l1_mean = sum(l1)/len(l1)
    l2_mean = sum(l2)/len(l2)
    sum1 = 0.
    sum2 = 0.
    numerator = 0.
    # Compute Spearman rs
    for i in range(0, len(l1)):
        numerator += (l1[i] - l1_mean)*(l2[i] - l2_mean)
        sum1 += (l1[i] - l1_mean)**2
        sum2 += (l2[i] - l2_mean)**2
    denum = sqrt(sum1)*sqrt(sum2)
    rs = numerator/denum
    # Compute Spearman t
    t = len(l1) - 2.
    t /= 1. - rs**2
    t = rs*sqrt(t)
    # If t > 0: change sign, since Student's t is axis-symmetric around zero
    if t > 0:
        t_help = (-1.)*t
    else:
        t_help = t
    #p = stdtr(len(z)-2.,t_help)
    p = stdtr(len(l1)-2., t_help)
    return (rs, p)
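# A minimal sanity check for spearman_rs above -- a sketch, assuming that
# rankdata comes from scipy.stats and sqrt from math (the function body does
# not show its imports). Note that the returned p is the one-sided lower-tail
# probability stdtr(n-2, -|t|); scipy.stats.spearmanr reports the two-sided
# p-value, so doubling p should reproduce it.
from math import sqrt
from scipy.stats import rankdata, spearmanr
from scipy.special import stdtr

x = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
y = [1.5, 1.0, 3.5, 3.0, 6.0, 5.0]
rs, p = spearman_rs(x, y)
rs_ref, p_ref = spearmanr(x, y)
print(rs, rs_ref)      # correlation coefficients should match
print(2 * p, p_ref)    # doubled one-sided p vs. scipy's two-sided p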
def welch_test(x1_stats, x2_stats):
    x1bar, x2bar, v1, v2, n1, n2 = x1_stats[0], x2_stats[0], \
        x1_stats[1]**2, x2_stats[1]**2, x1_stats[2], x2_stats[2]
    # Compute Welch's t-test using the descriptive statistics.
    tf = (x1bar - x2bar) / np.sqrt(v1/n1 + v2/n2)
    dof = (v1/n1 + v2/n2)**2 / (v1**2/(n1**2*(n1-1)) + v2**2/(n2**2*(n2-1)))
    pf = 2*stdtr(dof, -np.abs(tf))
    return float(pf)
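# A small usage sketch for welch_test above. The input layout is an
# assumption inferred from the indexing in the function: each *_stats tuple is
# (mean, sample standard deviation, sample size). Under that assumption the
# result should agree with scipy.stats.ttest_ind_from_stats(equal_var=False).
import numpy as np
from scipy.special import stdtr
from scipy.stats import ttest_ind_from_stats

group1 = (5.2, 1.1, 40)   # hypothetical (mean, std, n)
group2 = (4.8, 1.4, 35)
p_manual = welch_test(group1, group2)
t_ref, p_ref = ttest_ind_from_stats(group1[0], group1[1], group1[2],
                                    group2[0], group2[1], group2[2],
                                    equal_var=False)
print(p_manual, p_ref)    # the two two-sided p-values should match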
def one_sample_t(A, mu):
    n = len(A)
    df = n - 1
    z = np.mean(A) - mu
    z /= unbiased_std(A)
    t = z * np.sqrt(n)
    return t, stdtr(df, t)
def two_sample_t(A, B, expected_diff=0):
    diff = (np.mean(A) - np.mean(B) - expected_diff)
    na = len(A)
    nb = len(B)
    df = na + nb - 2
    # var() must be the unbiased sample variance (ddof=1) so that
    # var*(n-1) is the sum of squared deviations in the pooled estimate.
    sum_sq = (var(A)*(na-1) + var(B)*(nb-1))
    f = (1.0/na + 1.0/nb)/df
    t = diff/np.sqrt(sum_sq*f)
    return (t, stdtr(df, t))
def two_sample_t_test_welch(a, b):
    '''Welch t-test'''
    from scipy.special import stdtr
    abar = a.mean()
    avar = a.var(ddof=1)
    na = a.size
    adof = na - 1
    bbar = b.mean()
    bvar = b.var(ddof=1)
    nb = b.size
    bdof = nb - 1
    # Compute Welch's t-test using the descriptive statistics.
    tf = (abar - bbar) / np.sqrt(avar/na + bvar/nb)
    dof = (avar/na + bvar/nb)**2 / (avar**2/(na**2*adof) + bvar**2/(nb**2*bdof))
    # Note: this is the one-sided tail probability; the conventional
    # two-sided p-value is 2*pf.
    pf = stdtr(dof, -np.abs(tf))
    return (tf, pf)
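# Quick check for two_sample_t_test_welch above: it returns the one-sided tail
# probability stdtr(dof, -|t|), so doubling it should reproduce the two-sided
# p-value from scipy.stats.ttest_ind with equal_var=False. The data below are
# arbitrary example draws.
import numpy as np
from scipy.stats import ttest_ind

rng = np.random.default_rng(0)
a = rng.normal(0.0, 1.0, size=50)
b = rng.normal(0.3, 1.5, size=40)
tf, pf = two_sample_t_test_welch(a, b)
t_ref, p_ref = ttest_ind(a, b, equal_var=False)
print(tf, t_ref)        # t statistics should match
print(2 * pf, p_ref)    # doubled one-sided p vs. scipy's two-sided p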
def ttest(self):
    """ ttest implementation that uses efficient variance computation """
    abar = self.a_estimator.mean()
    bbar = self.b_estimator.mean()
    na = self.a_estimator.num_samples()
    adof = na - 1
    nb = self.b_estimator.num_samples()
    bdof = nb - 1
    avar = self.a_estimator.var()
    bvar = self.b_estimator.var()
    tf = (abar - bbar) / np.sqrt(avar/na + bvar/nb)
    dof = (avar/na + bvar/nb)**2 / (avar**2/(na**2*adof) + bvar**2/(nb**2*bdof))
    pf = 2*stdtr(dof, -np.abs(tf))
    return pf
def t_test_manual(l1, l2):
    l1 = np.asarray(l1)
    l2 = np.asarray(l2)
    l1bar = l1.mean()
    l2bar = l2.mean()
    l1var = l1.var(ddof=1)  # ddof=1 gives the unbiased sample variance (divide by n-1)
    l2var = l2.var(ddof=1)
    n_l1 = l1.size
    n_l2 = l2.size
    df_l1 = n_l1 - 1
    df_l2 = n_l2 - 1
    # Use the descriptive statistics to compute Welch's t-test.
    tf = (l1bar - l2bar) / np.sqrt(l1var/n_l1 + l2var/n_l2)
    dof = (l1var/n_l1 + l2var/n_l2)**2 / (l1var**2/(n_l1**2*df_l1) + l2var**2/(n_l2**2*df_l2))
    pf = 2*stdtr(dof, -np.abs(tf))
    return tf, pf
def character_stats(self):
    for char, cngrams in self.character_ngrams.items():
        for words, cn in cngrams.items():
            n = len(words)
            count_all = self.ngrams[words]['count']
            count_char = cn['count']
            # bernoulli!
            char_total = float(self.character_ngram_totals[char][n])
            cn['freq'] = char_p = count_char / char_total
            other_total = self.ngram_totals[n] - char_total
            cn['other_freq'] = other_p = (count_all - count_char) / other_total
            if count_all == count_char:
                p_value = 0.0  # only this character ever says it! (also would cause /0)
            else:
                if char_total == 1.0:
                    # special case to avoid divide by zero
                    nu, t = size1special(char_p, other_p,
                                         other_p*(1.0-other_p), other_total)
                else:
                    nu, t = welch(char_p, char_p*(1.0-char_p), char_total,
                                  other_p, other_p*(1.0-other_p), other_total)
                p_value = 1.0 - stdtr(nu, t)
            cn['p_value'] = p_value
def cdf(self, x):
    """
    Computes the cumulative distribution function of the distribution
    at the point(s) x. For x >= 0 the cdf is defined as

        F(x|nu) = 1 - 1/2 * I_{x(t)}(nu / 2, 1 / 2)

    where x(t) = nu / (t ** 2 + nu) and I_x is the regularized
    incomplete beta function.

    Parameters
    ----------
    x : array, dtype=float, shape=(m x n)
        The value(s) at which the user would like the cdf evaluated.
        If an array is passed in, the cdf is evaluated at every point
        in the array and an array of the same size is returned.

    Returns
    -------
    cdf : array, dtype=float, shape=(m x n)
        The cdf at each point in x.
    """
    cdf = stdtr(self.nu, x)
    return cdf
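# A minimal numerical check of the identity in the docstring above (not part
# of the class): for x >= 0, stdtr(nu, x) should equal
# 1 - 0.5 * I_{nu/(x**2 + nu)}(nu/2, 1/2), with the regularized incomplete
# beta function taken from scipy.special.betainc. The values of nu and x are
# arbitrary.
from scipy.special import stdtr, betainc

nu, x = 5.0, 1.3
via_beta = 1.0 - 0.5 * betainc(nu / 2.0, 0.5, nu / (x**2 + nu))
print(stdtr(nu, x), via_beta)   # the two values should agree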
cholsigmainv = np.linalg.cholesky(np.linalg.inv(np.cov(screens.T)))
warped_screens = screens.values @ cholsigmainv
warped_intercept = cholsigmainv.sum(axis=0)

# Then just run linear regression; this implementation is based on
# https://pingouin-stats.org/generated/pingouin.linear_regression.html
def linear_regression(warped_screens, warped_intercept):
    GLS_coef = np.empty((len(warped_screens), len(warped_screens)))
    GLS_se = np.empty((len(warped_screens), len(warped_screens)))
    ys = warped_screens.T
    for gene_index in range(len(warped_screens)):
        X = np.stack((warped_intercept, warped_screens[gene_index]), axis=1)
        coef, residues = np.linalg.lstsq(X, ys, rcond=None)[:2]
        df = warped_screens.shape[1] - 2
        GLS_coef[gene_index] = coef[1]
        GLS_se[gene_index] = \
            np.sqrt(np.linalg.pinv(X.T @ X)[1, 1] * residues / df)
    return GLS_coef, GLS_se

GLS_coef, GLS_se = linear_regression(warped_screens, warped_intercept)
df = warped_screens.shape[1] - 2
GLS_p = 2 * stdtr(df, -np.abs(GLS_coef / GLS_se))
np.fill_diagonal(GLS_p, 1)

# Save everything
np.save('GLS_p.npy', GLS_p)
np.save('GLS_sign.npy', np.sign(GLS_coef))
screens.index.to_series().to_csv('genes.txt', index=False, header=False)
def _cdf(self, x, df, C, Ci):
    out = special.stdtr(df, numpy.dot(Ci, special.stdtrit(df, x)))
    return out
# Create sample data.
a = df0.outcome
b = df1.outcome

# Use scipy.stats.ttest_ind.
t, p = ttest_ind(a, b, equal_var=False)
print("ttest_ind: t = %g p = %g" % (t, p))
results = ("ttest_ind: t = %g p = %g" % (t, p))

# Compute the descriptive statistics of a and b.
abar = a.mean()
avar = a.var(ddof=1)
na = a.size
adof = na - 1

bbar = b.mean()
bvar = b.var(ddof=1)
nb = b.size
bdof = nb - 1

# Use scipy.stats.ttest_ind_from_stats.
t2, p2 = ttest_ind_from_stats(abar, np.sqrt(avar), na,
                              bbar, np.sqrt(bvar), nb,
                              equal_var=False)
print("ttest_ind_from_stats: t = %g p = %g" % (t2, p2))

# Use the formulas directly.
tf = (abar - bbar) / np.sqrt(avar/na + bvar/nb)
dof = (avar/na + bvar/nb)**2 / (avar**2/(na**2*adof) + bvar**2/(nb**2*bdof))
pf = 2*stdtr(dof, -np.abs(tf))
def one_sample_t(A, mu):
    n = len(A)
    df = n - 1
    z = (np.mean(A) - mu) / std(A)
    t = z * np.sqrt(n)
    return t, stdtr(df, t)
def _cdf(self, x, a, C, Ci, loc):
    x = numpy.dot(Ci, (x.T-loc.T).T)
    return special.stdtr(a, x)
def _cdft(x, df):
    return special.stdtr(df, x)
def _ppf(self, q, df, C, Ci):
    out = special.stdtr(df, numpy.dot(C, special.stdtrit(df, q)))
    return out
def _cdft(x, df):
    return special.stdtr(df, x)  # pylint: disable=no-member
def _pdf(self, x, df, alpha):
    # 2*normpdf(x)*normcdf(alpha*x)
    return 2.0 * distributions.t._pdf(x, df) * special.stdtr(
        df + 1, alpha * x * np.sqrt((1 + df) / (x ** 2 + df)))
def _cdf(self, x, a):
    return special.stdtr(a, x)
def conll_to_contexts(self, conll_file, ctxt_out_file,
                      ctxt_type="syntactic", ctxt_dir="up,down",
                      pterm_min_freq=1000, ctxt_min_freq=1000,
                      pterm_pos="ADJ,ADV,NC,V,VIMP,VINF,VPP,VPR,VS",
                      pterm_cpos="ADJ,ADV,NC,V,VIMP,VINF,VPP,VPR,VS",
                      pterm_use_lem="ADJ,ADV,NC,V,VIMP,VINF,VPP,VPR,VS",
                      sterm_pos="ADJ,ADV,NC,V,VIMP,VINF,VPP,VPR,VS",
                      sterm_cpos="ADJ,ADV,NC,V,VIMP,VINF,VPP,VPR,VS",
                      sterm_use_lem="ADJ,ADV,NC,V,VIMP,VINF,VPP,VPR,VS",
                      skip_pos="P,P+D,CC,CS", skip_cpos="",
                      skip_use_lem="P,P+D,CC,CS", skip_only=False,
                      use_deplabel=True, weight_fun="pmi"):
    # Read in data.
    print >> sys.stderr, "Extracting context relations from CONLL..."
    t0 = time.time()
    pterm_min_freq = int(pterm_min_freq)
    ctxt_min_freq = int(ctxt_min_freq)
    ctxt_type_set = set(ctxt_type.split(","))
    ctxt_dir_set = set(ctxt_dir.split(","))
    pterm_pos_set = set(pterm_pos.split(","))
    pterm_cpos_set = set(pterm_cpos.split(","))
    pterm_use_lem_set = set(pterm_use_lem.split(","))
    sterm_pos_set = set(sterm_pos.split(","))
    sterm_cpos_set = set(sterm_cpos.split(","))
    sterm_use_lem_set = set(sterm_use_lem.split(","))
    skip_pos_set = set(skip_pos.split(","))
    skip_cpos_set = set(skip_cpos.split(","))
    skip_use_lem_set = set(skip_use_lem.split(","))
    sent = [()]      # (LEMMA, CPS, FPS, HEAD, LABEL)
    pterm_cnt = {}   # PTERM -> COUNT
    prel_cnt = {}    # (PTERM, REL) -> COUNT
    crel_cnt = {}    # PTERM -> (REL, STERM) -> COUNT
    ctxt_cnt = {}    # (REL, STERM) -> COUNT
    rel_cnt = {}     # REL -> COUNT
    sterm_cnt = {}   # STERM -> COUNT
    tot_cnt = 0
    conll_f = codecs.open(conll_file, 'r', ENCODING)
    for _, sent in read_conll(conll_f, mode="extract"):
        # Extract context relations from a full sent.
        for i in range(1, len(sent)):
            crels = []
            dep = sent[i]
            deppos = dep[CPS] if dep[FPS] in pterm_cpos_set \
                else dep[FPS]
            # Linear dependency context relations.
            if "linear" in ctxt_type_set:
                prv = sent[i-1] if i > 1 else None
                nxt = sent[i+1] if i < len(sent)-1 else None
                # Store previous token relation.
                if prv and \
                        "prev" in ctxt_dir_set and \
                        dep[FPS] in pterm_pos_set and \
                        prv[FPS] in sterm_pos_set:
                    prvpos = prv[CPS] if prv[FPS] in sterm_cpos_set \
                        else prv[FPS]
                    rel = tuple(["*p*"])
                    pterm = deppos, "<"+deppos+">"
                    if dep[FPS] in pterm_use_lem_set:
                        pterm = deppos, dep[LEM]
                    # Store up to two relations, depending on sterm lex
                    sterm = prvpos, "<"+prvpos+">"
                    crels.append((pterm, sterm, rel))
                    if prv[FPS] in sterm_use_lem_set:
                        sterm = prvpos, prv[LEM]
                        crels.append((pterm, sterm, rel))
                # Store next token relation.
                if nxt and \
                        "next" in ctxt_dir_set and \
                        dep[FPS] in pterm_pos_set and \
                        nxt[FPS] in sterm_pos_set:
                    nxtpos = nxt[CPS] if nxt[FPS] in sterm_cpos_set \
                        else nxt[FPS]
                    rel = tuple(["*n*"])
                    pterm = deppos, "<"+deppos+">"
                    if dep[FPS] in pterm_use_lem_set:
                        pterm = deppos, dep[LEM]
                    # Store up to two relations, depending on sterm lex
                    sterm = nxtpos, "<"+nxtpos+">"
                    crels.append((pterm, sterm, rel))
                    if nxt[FPS] in sterm_use_lem_set:
                        sterm = nxtpos, nxt[LEM]
                        crels.append((pterm, sterm, rel))
            # Syntactic dependency context relations.
            if "syntactic" in ctxt_type_set:
                gov = sent[dep[GOV]]
                path = []
                if use_deplabel:
                    path.append(dep[LAB])
                # Skip at most one time to next governor up.
                skipped = False
                if len(gov) > 0 and gov[FPS] in skip_pos_set:
                    govpos = gov[CPS] if gov[FPS] in skip_cpos_set \
                        else gov[FPS]
                    if gov[FPS] in skip_use_lem_set:
                        path.append(govpos+"|"+gov[LEM])
                    else:
                        path.append(govpos)
                    if use_deplabel:
                        path.append(gov[LAB])
                    gov = sent[gov[GOV]]
                    if len(gov) > 0:
                        if gov[FPS] in skip_pos_set:
                            # Can't skip twice
                            gov = []
                        else:
                            skipped = True
                if len(gov) > 0 and (skipped or not skip_only):
                    # Store upward relation.
                    if "up" in ctxt_dir_set and \
                            dep[FPS] in pterm_pos_set and \
                            gov[FPS] in sterm_pos_set:
                        govpos = gov[CPS] if gov[FPS] in sterm_cpos_set \
                            else gov[FPS]
                        rel = tuple(["*u*"] + path)
                        pterm = deppos, "<"+deppos+">"
                        if dep[FPS] in pterm_use_lem_set:
                            pterm = deppos, dep[LEM]
                        # Store up to two relations, depending on sterm lex
                        sterm = govpos, "<"+govpos+">"
                        crels.append((pterm, sterm, rel))
                        if gov[FPS] in sterm_use_lem_set:
                            sterm = govpos, gov[LEM]
                            crels.append((pterm, sterm, rel))
                    # Store downward relation.
                    if "down" in ctxt_dir_set and \
                            gov[FPS] in pterm_pos_set and \
                            dep[FPS] in sterm_pos_set:
                        govpos = gov[CPS] if gov[FPS] in pterm_cpos_set \
                            else gov[FPS]
                        path.reverse()
                        rel = tuple(["*d*"] + path)
                        pterm = govpos, "<"+govpos+">"
                        if gov[FPS] in pterm_use_lem_set:
                            pterm = govpos, gov[LEM]
                        # Store up to two relations, depending on sterm lex
                        sterm = deppos, "<"+deppos+">"
                        crels.append((pterm, sterm, rel))
                        if dep[FPS] in sterm_use_lem_set:
                            sterm = deppos, dep[LEM]
                            crels.append((pterm, sterm, rel))
            # Store relevant pterm, context, context relation counts.
            for pterm, sterm, rel in crels:
                ctxt = rel, sterm
                prel = pterm, rel
                crel = pterm, rel, sterm
                pterm_cnt[pterm] = pterm_cnt.get(pterm, 0) + 1
                ctxt_cnt[ctxt] = ctxt_cnt.get(ctxt, 0) + 1
                if pterm not in crel_cnt:
                    crel_cnt[pterm] = {}
                crel_cnt[pterm][ctxt] = crel_cnt[pterm].get(ctxt, 0) + 1
                tot_cnt += 1
        sent = [()]
    conll_f.close()
    # print >> sys.stderr, "# crel occurrences:", tot_cnt

    # Retain and weight frequent pterm and ctxt only
    fctxt = open(ctxt_out_file, "wb")
    # Store vocabularies
    id_to_pterm = []
    unk_pos = {}
    cnt = 0
    for pterm in sorted(pterm_cnt.keys()):
        pterm_pos, pterm_lem = pterm
        if pterm_pos.startswith("V") and pterm_lem in STOP:
            del pterm_cnt[pterm]
            del crel_cnt[pterm]
            continue
        if pterm_cnt[pterm] >= pterm_min_freq:
            id_to_pterm.append(pterm)
            cnt += 1
        else:
            unk = (pterm_pos, "<UNK>")
            if pterm_pos not in unk_pos:
                unk_pos[pterm_pos] = True
                pterm_cnt[unk] = 0
                crel_cnt[unk] = {}
                id_to_pterm.append(unk)
                cnt += 1
            pterm_cnt[unk] += pterm_cnt[pterm]
            for ctxt in crel_cnt[pterm].keys():
                crel_cnt[unk][ctxt] = crel_cnt[unk].get(ctxt, 0) + \
                    crel_cnt[pterm][ctxt]
            del pterm_cnt[pterm]
            del crel_cnt[pterm]
    id_to_pterm = tuple(id_to_pterm)
    print >> sys.stderr, "# pterm found:", cnt
    id_to_ctxt = []
    cnt = 0
    for ctxt in sorted(ctxt_cnt.keys()):
        if ctxt_cnt[ctxt] >= ctxt_min_freq:
            id_to_ctxt.append(ctxt)
            cnt += 1
        else:
            del ctxt_cnt[ctxt]
    id_to_ctxt = tuple(id_to_ctxt)
    print >> sys.stderr, "# ctxt found:", cnt
    cPickle.dump(id_to_pterm, fctxt, -1)
    cPickle.dump(id_to_ctxt, fctxt, -1)
    # Reusable extremum weights, given the total count n
    n = tot_cnt
    n2 = n**2
    # PMI min: c1=c2=n/2, c12=1
    pmi_min = log(4.0/n, 2)
    # PMI max: c1=pterm_min_freq, c2=ctxt_min_freq, c12=min(c1,c2)
    maxmin_cut = float(max(pterm_min_freq, ctxt_min_freq))
    pmi_max = log(n/maxmin_cut, 2)
    # LRATIO min: useful only if p1 < p2, so min is 0
    lratio_min = 0.0
    # LRATIO max: c1=c2=c12=n/x, where x is optimal divisor
    x = 3.9215536345675
    x2 = x**2
    lratio_max = -2*((n/x)*log(1/x2) + (n-n/x)*log(1-1/x2) -
                     (n/x)*log(1/x) - (n-n/x)*log(1-1/x))
    for ptermid in xrange(len(id_to_pterm)):
        curvector = [None]*len(id_to_ctxt)
        pterm = id_to_pterm[ptermid]
        for ctxtid in xrange(len(id_to_ctxt)):
            ctxt = id_to_ctxt[ctxtid]
            # Quick values for math
            c12 = crel_cnt[pterm].get(ctxt, 0)
            c1 = pterm_cnt[pterm]
            c2 = ctxt_cnt[ctxt]
            p1 = float(c1 * c2) / n2   # Null hypothesis
            p2 = float(c12) / n        # Alternative hypothesis
            cont = [[c12, c2-c12], [c1-c12, n+c12-c1-c2]]
            if weight_fun == "relfreq":
                # [0,1] proportion
                wgt = float(c12) / c1
            elif weight_fun == "chisq":
                # [-1,1] p-value
                _, pval, _, _ = stats.chi2_contingency(cont)
                wgt = 1 - pval if p1 < p2 else pval - 1
            elif weight_fun == "ttest":
                # [-1,1] p-value
                wgt = -1
                if c12 > 0:
                    tval = (c12 - float(c1*c2)/n) / \
                        sqrt(c12 * (1 - float(c12)/n))
                    if tval < 0:
                        wgt = -1 + 2*special.stdtr(n, tval)
                    else:
                        wgt = 1 - 2*special.stdtr(n, -tval)
            elif weight_fun == "binom":
                # [-1,1] Exact one-sided test
                wgt = 0.0
                if p1 < p2:
                    # implied that c12 > 0
                    wgt = stats.binom.cdf(c12-1, n, p1)
                else:
                    wgt = stats.binom.cdf(c12, n, p1) - 1
            elif weight_fun == "pmi":
                # [-1,1] information
                # Transform from [pmi_min,pmi_max]
                wgt = pmi_min
                #wgt = -1
                if c12 > 0:
                    wgt = log(float(c12 * n) / (c1 * c2), 2)
                    #if pmi < 0:
                    #    wgt = -pmi / pmi_min
                    #else:
                    #    wgt = pmi / pmi_max
            elif weight_fun == "lratio":
                # [0, 1]
                # Transform from [0, lratio_max]
                wgt = lratio_min
                if p1 < p2:
                    wgt = -2*(c12*log(p1) + (n-c12)*log(1-p1) -
                              c12*log(p2) - (n-c12)*log(1-p2))
                    wgt /= lratio_max
            curvector[ctxtid] = wgt
        cPickle.dump(curvector, fctxt, -1)
    fctxt.close()
    print >> sys.stderr, "Done in %s sec." % (time.time()-t0)
def _cdf(self, x, a, C, Ci, loc):
    x = np.dot(Ci, (x.T-loc.T).T)
    return special.stdtr(a, x)
for i in range(m + 1, len(accuracy_df.index)):
    mean1 = accuracy_df["2. Average Accuracy"][m]
    mean2 = accuracy_df["2. Average Accuracy"][i]
    N1 = n
    N2 = n
    sample_std1 = accuracy_df["3. Accuracy Standard Deviation"][m]
    sample_variance1 = (sample_std1) ** 2
    sample_std2 = accuracy_df["3. Accuracy Standard Deviation"][i]
    sample_variance2 = (sample_std2) ** 2
    if mean1 > mean2:
        T_numerator = mean1 - mean2
    else:
        T_numerator = mean2 - mean1
    T_denominator = math.sqrt(sample_variance1 / N1 + sample_variance2 / N2)
    T = T_numerator / T_denominator
    deg_fre_numerator = (sample_variance1 / N1 + sample_variance2 / N2) ** 2
    deg_fre_denominator = (((sample_variance1) / N1) ** 2) / (N1 - 1) + \
        (((sample_variance2) / N2) ** 2) / (N2 - 1)
    deg_fre = deg_fre_numerator / deg_fre_denominator
    pf = 2 * stdtr(deg_fre, -np.abs(T))
    print(t_test_df[t_test_df.columns.values[0]])
    t_test_df[t_test_df.columns.values[m + 1]][i] = pf

writer = pd.ExcelWriter("accuracy_t_test.xlsx", engine="xlsxwriter")
t_test_df.to_excel(writer)
writer.save()