def different_stdev_explicite(self, alpha, y1, y2, S1, S2, n1, n2):
    t0 = (y1 - y2) / np.sqrt(S1**2 / n1 + S2**2 / n2)
    # hypothesis testing: Welch-Satterthwaite degrees of freedom
    df = int((S1**2 / n1 + S2**2 / n2)**2 /
             ((S1**2 / n1)**2 / (n1 - 1) + (S2**2 / n2)**2 / (n2 - 1)))
    H1a = t.ppf(1 - alpha / 2., df) < np.abs(t0)
    H1b = t.ppf(1 - alpha, df) < t0
    H1c = t.ppf(alpha, df) > t0
    # p-values
    p1a = t.sf(np.abs(t0), df) * 2
    p1b = t.sf(t0, df)
    p1c = t.cdf(t0, df)
    c1 = y1 - y2 - t.ppf(1 - alpha / 2., df) * np.sqrt(S1**2 / n1 + S2**2 / n2)
    c2 = y1 - y2 + t.ppf(1 - alpha / 2., df) * np.sqrt(S1**2 / n1 + S2**2 / n2)
    CI = (c1, c2)
    print('at the level of significance', alpha, ':')
    print('H1 mu1 != mu2 is', H1a)
    print('H1 mu1 > mu2 is', H1b)
    print('H1 mu1 < mu2 is', H1c)
    print('probability of type I error for mu1 != mu2:', p1a)
    print('probability of type I error for mu1 > mu2:', p1b)
    print('probability of type I error for mu1 < mu2:', p1c)
    # dividing the CI tuple by y1 raises a TypeError; report the interval
    # relative to y1 element-wise instead
    print('CI (%.1f%%) for mu1 - mu2:' % (100 - 100 * alpha), CI,
          (c1 / y1, c2 / y1))
def ttest_1samp(self, a, popmean):
    if len(a) == 0:
        return [None, None]
    if len(a) == 1:
        return [None, None]
    # calculate the sample mean
    avg = 0.0
    for x in a:
        avg += x
    avg = avg / len(a)
    # sample standard deviation (ddof = 1)
    S = 0.0
    for x in a:
        S += (x - avg)**2
    S = (S / (len(a) - 1))**0.5
    print(S)
    if S == 0:
        return [None, None]
    tvalue = (avg - popmean) / (S / (len(a)**0.5))
    # two-sided p-value; both branches reduce to 2 * t.sf(|t|, n - 1)
    if tvalue >= 0:
        p = t.sf(x=tvalue, df=len(a) - 1) * 2
        return [tvalue, p]
    else:
        p = 2 * t.sf(x=-tvalue, df=len(a) - 1)
        return [tvalue, p]
def p_valut_t(x_bar, mu, s, n, how):
    """
    Compute the p-value for a hypothesis test on the population mean when
    sigma is unknown (one-sample t-test).

    Params
    ------
    x_bar: sample mean
    mu: hypothesized population mean (the target value)
    s: sample standard deviation
    n: sample size
    how: type of test, one of ('up', 'down', 'double')

    Return
    ------
    (test statistic, p-value)
    """
    t_dist = t(n - 1)  # frozen t distribution with n - 1 degrees of freedom
    t_val = (x_bar - mu) / (s / np.sqrt(n))
    # the original referenced an undefined name `z` and called the unfrozen
    # class; use the frozen distribution and the computed statistic instead
    if how == 'up':
        p = t_dist.sf(t_val)
    elif how == 'down':
        p = t_dist.cdf(t_val)
    elif how == 'double':
        p = t_dist.sf(abs(t_val)) * 2
    else:
        raise ValueError("how must be one of 'up', 'down', 'double'")
    return t_val, p
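# Usage sketch for p_valut_t (added; not part of the original source). The
# numbers are made up for illustration; `t` and `np` are scipy.stats.t and
# numpy, as assumed throughout these snippets.
t_val, p = p_valut_t(x_bar=52.1, mu=50.0, s=4.3, n=25, how='double')
# For the same raw data, scipy.stats.ttest_1samp returns the identical
# two-sided p-value, which makes a convenient cross-check.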
def regression_analysis(self, key, info):
    '''
    Calculates all the values we will need for simple linear
    regression analysis, and does the analysis itself.
    '''
    # not the most efficient, but we want to keep these values
    # to calculate standard errors
    info = list(info)
    # calculate sums
    sumx, sumy, sumxx, sumyy, sumxy, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
    for (x, y) in info:
        sumx += x
        sumy += y
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y
        n += 1
    # calculate correlation
    corr = 0
    corr_denom = math.sqrt((n * sumxx - sumx**2) * (n * sumyy - sumy**2))
    if corr_denom < 0.0001:
        yield False, "Could not calculate coefficients"
        return
    corr_num = n * sumxy - sumx * sumy
    corr = corr_num / corr_denom
    if abs(corr) < 0.0001:
        yield False, "Could not calculate coefficients"
        return
    # calculate regression coefficients
    beta1 = (sumxy - sumx * sumy / n) / (sumxx - sumx**2 / n)
    beta0 = (sumy - beta1 * sumx) / n
    # calculate standard errors; the fitted values must come from the x's
    # (not the y's), and the residuals must be squared
    y_reals = [y for (x, y) in info]
    y_hats = [beta0 + beta1 * x for (x, y) in info]
    s_num = sum([(y - yhat)**2 for (y, yhat) in zip(y_reals, y_hats)])
    s = math.sqrt(s_num / (n - 2))
    se_denom = n * sumxx - sumx**2
    se_beta0 = s * math.sqrt(sumxx / se_denom)
    se_beta1 = s * math.sqrt(n / se_denom)
    # calculate t-values
    t0 = beta0 / se_beta0
    t1 = beta1 / se_beta1
    # calculate 2-sided p-values
    alpha = 0.05
    t_stat = t.ppf(1 - alpha / 2, n - 2)  # critical value (computed but unused)
    beta0_p_value = t.sf(abs(t0), n - 2) * 2
    beta1_p_value = t.sf(abs(t1), n - 2) * 2
    # output most important values in a human-readable format
    print("Correlation: {}".format(corr))
    print("Beta 0: {}, p-value: {}".format(beta0, beta0_p_value))
    print("Beta 1: {}, p-value: {}".format(beta1, beta1_p_value))
def _correl_pvalue(r, n, k=0, alternative="two-sided"):
    """Compute the p-value of a correlation coefficient.

    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html
    https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Using_the_exact_distribution

    See also scipy.stats._ttest_finish

    Parameters
    ----------
    r : float
        Correlation coefficient.
    n : int
        Sample size
    k : int
        Number of covariates for (semi)-partial correlation.
    alternative : string
        Tail of the test.

    Returns
    -------
    pval : float
        p-value.

    Notes
    -----
    This uses the same approach as :py:func:`scipy.stats.pearsonr` to
    calculate the p-value (i.e. using a beta distribution)
    """
    from scipy.stats import t
    assert alternative in ['two-sided', 'greater', 'less'], (
        "Alternative must be one of 'two-sided' (default), 'greater' or 'less'.")

    # Method 1: using a student T distribution
    dof = n - k - 2
    tval = r * np.sqrt(dof / (1 - r**2))
    if alternative == 'less':
        pval = t.cdf(tval, dof)
    elif alternative == 'greater':
        pval = t.sf(tval, dof)
    elif alternative == 'two-sided':
        pval = 2 * t.sf(np.abs(tval), dof)

    # Method 2: beta distribution (similar to scipy.stats.pearsonr, faster)
    # from scipy.special import btdtr
    # ab = (n - k) / 2 - 1
    # pval = 2 * btdtr(ab, ab, 0.5 * (1 - abs(np.float64(r))))
    return pval
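# Sanity-check sketch for _correl_pvalue (added; not in the original source):
# for k=0 its two-sided p-value is analytically identical to the one from
# scipy.stats.pearsonr, so the two should agree to floating-point precision.
import numpy as np
from scipy.stats import pearsonr
rng = np.random.default_rng(0)
x = rng.normal(size=30)
y = x + rng.normal(size=30)
r, p_scipy = pearsonr(x, y)
p_ours = _correl_pvalue(r, n=30, k=0, alternative="two-sided")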
def kramers_v(x, y, bias_correction=True):
    """Calculates Cramer's V statistic for categorical-categorical association.

    Taken from https://github.com/shakedzy/dython/blob/master/dython/nominal.py
    Inspired by Shaked Zychlinski. This is a symmetric coefficient: V(x,y) = V(y,x)
    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    bias_correction : Boolean, default = True
        Use bias correction from Bergsma and Wicher, Journal of the Korean
        Statistical Society 42 (2013): 323-328.

    Returns:
    --------
    float in the range of [0,1]
    """
    confusion_matrix = crosstab(x, y)
    c2, p_value = chi2_contingency(confusion_matrix)[:2]
    n = confusion_matrix.sum().sum()
    phi2 = c2 / n
    r, k = confusion_matrix.shape
    # the p-value of the association comes from the chi-square test itself;
    # the original drew it from a t distribution applied to a fixed critical
    # value, which is not a valid p-value for V
    if bias_correction:
        phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
        rcorr = r - ((r - 1) ** 2) / (n - 1)
        kcorr = k - ((k - 1) ** 2) / (n - 1)
        if min((kcorr - 1), (rcorr - 1)) == 0:
            warnings.warn(
                "Unable to calculate Cramer's V using bias correction. "
                "Consider using bias_correction=False", RuntimeWarning)
            return np.nan
        V = np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
        return V, p_value
    else:
        V = np.sqrt(phi2 / min(k - 1, r - 1))
        return V, p_value
def pearsonr(self, x, y):
    n = len(x)
    if n == 0:
        return [None, None]
    sum_x1 = 0
    sum_x2 = 0
    for i in x:
        sum_x1 += float(i)
        sum_x2 += float(i)**2
    sum_y1 = 0
    sum_y2 = 0
    for i in y:
        sum_y1 += float(i)
        sum_y2 += float(i)**2
    f1 = 0
    for i in range(n):
        f1 += float(x[i]) * float(y[i])
    f1 = f1 * n
    f1 = f1 - sum_x1 * sum_y1
    f21 = (n * sum_x2 - sum_x1**2)**0.5
    f22 = (n * sum_y2 - sum_y1**2)**0.5
    f2 = f21 * f22
    r = f1 / f2
    r = round(r, 6)
    if r == 1 or r == -1:
        p = 0
    else:
        T = r * ((n - 2) / (1 - r**2))**0.5
        p = t.sf(abs(T), (n - 2)) * 2
        p = round(p, 6)
        print(p)
    return [r, p]
def dunnetts_post_hoc(X0, X, alpha):
    Y = [X0, *X]
    p = len(X)
    N_i = [len(y) for y in Y]
    n = np.sum(N_i) - (p + 1)  # degrees of freedom
    # pooled variance: s^2 = sum over groups of sum((X_ij - group mean)^2) / n;
    # summing group-by-group avoids building a ragged nested array
    s_num = np.sum([np.sum((np.asarray(x) - np.mean(x))**2) for x in Y])
    s = np.sqrt(s_num / n)
    N = [len(x) for x in X]
    m0 = np.mean(X0)
    N0 = len(X0)
    # get 2-tailed critical value from the t-distribution
    t_cv = t.ppf(1 - (alpha / 2), n)
    CI = []
    P = []
    for x, Ni in zip(X, N):
        mx = np.mean(x)
        A0 = t_cv * s * np.sqrt(1 / Ni + 1 / N0)
        Ai = np.abs(mx - m0)
        Ti = Ai / (s * np.sqrt(1 / Ni + 1 / N0))
        Pi = t.sf(Ti, n)
        P.append(Pi)
        CI.append((Ai - A0, Ai + A0))
    return CI, P
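# Usage sketch for dunnetts_post_hoc (added; illustrative data, not from the
# original source). X0 is the control group; X is a list of treatment groups.
# Note this uses plain t quantiles rather than the Dunnett distribution, so
# the intervals are not corrected for multiple comparisons.
control = [5.1, 4.8, 5.3, 5.0]
treatments = [[5.9, 6.1, 5.7, 6.0], [4.9, 5.2, 5.1, 5.0]]
CI, P = dunnetts_post_hoc(control, treatments, alpha=0.05)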
def main(feature_set):
    coef_list = []
    for iteration in range(MAX_ITERATIONS):
        print('iteration: %d\r' % (iteration + 1), end='')
        x_train, x_test, y_train, y_test = get_regression_dataset(
            0.6, feature_set=feature_set)
        # x_train, x_test = x_train[feature_set], x_test[feature_set]
        lr = LinearRegression()
        lr.fit(x_train, y_train)
        coef_list.append(lr.coef_)
    coef_list = np.array(coef_list)
    se = np.std(coef_list, 0) / np.sqrt(MAX_ITERATIONS)
    t = np.mean(coef_list, 0) / se
    pvalue = t_table.sf(np.fabs(t), len(t) - 1) * 2
    coef_list = np.mean(coef_list, 0)
    print('\n\n{:25s} {:s} {:s} {:s} {:s}'.format(
        'Field', 'COEF', 'Standard Error', 't-Statistics', 'P-value'))
    print('================================================================================')
    for values in zip(feature_set, coef_list, se, t, pvalue):
        print('{:25s} {:3.4f} \t {:3.4f} \t {:3.4f} \t {:3.6f}'.format(
            *values))
    print('\n')
    print_errors(lr, x_train, y_train.values, x_test, y_test.values,
                 msg='Full Features')
def t_equal_var(n1, m1, var1, n2, m2, var2):
    # pooled variance times the (1/n1 + 1/n2) factor
    temp = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2) * (1 / n1 + 1 / n2)
    _t = (m1 - m2) / np.sqrt(temp)
    _v = n1 + n2 - 2
    _p = t.sf(_t, _v)  # one-sided p-value (H1: m1 > m2)
    return _t, _v, _p
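# Cross-check sketch (added; not in the original): with SciPy >= 1.6 the same
# one-sided pooled-variance test is available from summary statistics.
import numpy as np
from scipy.stats import ttest_ind_from_stats
res = ttest_ind_from_stats(mean1=5.1, std1=np.sqrt(2.0), nobs1=12,
                           mean2=4.3, std2=np.sqrt(3.5), nobs2=15,
                           equal_var=True, alternative='greater')
# res.statistic and res.pvalue should match t_equal_var(12, 5.1, 2.0, 15, 4.3, 3.5)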
def compute_corrected_ttest(differences, df, n_train, n_test):
    """Computes right-tailed paired t-test with corrected variance.

    Parameters
    ----------
    differences : array-like of shape (n_samples, 1)
        Vector containing the differences in the score metrics of two models.
    df : int
        Degrees of freedom.
    n_train : int
        Number of samples in the training set.
    n_test : int
        Number of samples in the testing set.

    Returns
    -------
    t_stat : float
        Variance-corrected t-statistic.
    p_val : float
        Variance-corrected p-value.
    """
    mean = np.mean(differences)
    std = corrected_std(differences, n_train, n_test)
    t_stat = mean / std
    p_val = t.sf(np.abs(t_stat), df)  # right-tailed t-test
    return t_stat, p_val
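# compute_corrected_ttest above relies on a corrected_std helper that is not
# shown. A minimal sketch following Nadeau & Bengio (2003), matching the
# scikit-learn example this function appears to be taken from:
import numpy as np

def corrected_std(differences, n_train, n_test):
    # the variance is inflated by (1/k + n_test/n_train) to account for the
    # correlation between cross-validation folds that share training data
    kr = len(differences)
    corrected_var = np.var(differences, ddof=1) * (1 / kr + n_test / n_train)
    return np.sqrt(corrected_var)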
def get_correlation_parallel(s1, s2):
    """
    params s1 - series 1
    params s2 - series 2
    NOTE : series are numbered 1 to 25 when given in arguments
    returns the correlation between the series
    """
    start = time.time()
    offsets = []  # these will be the arguments to all the parallel jobs
    instances = MAX_ROWS // BATCH_SIZE  # Pool needs an integer process count
    mean, std = calculate_mean_std_parallel()
    stripped_mean, stripped_std = calculate_stripped_mean_std_parallel(
        mean, std)
    processes = Pool(processes=instances)
    for i in range(instances):
        offsets.append(
            (s1, s2, mean, std, stripped_mean, stripped_std, i * BATCH_SIZE))
    results = processes.map(get_correlation, offsets)
    processes.close()
    processes.join()
    pearson_corr = 0
    total = 0
    for result in results:
        pearson_corr += result[0] * result[1]
        total += result[1]
    pearson_corr = 1.0 * pearson_corr / total
    t_value = abs(pearson_corr * math.sqrt(
        1.0 * (total - 2) / (1 - (pearson_corr * pearson_corr))))
    p_value = t.sf(t_value, total - 2)
    print("\n ######### CORRELATION BETWEEN SERIES", s1, "AND SERIES", s2,
          "is", pearson_corr, "t value is", t_value, "and p value is",
          p_value, "######### \n")
    end = time.time()
    print("EXECUTION TIME :", end - start, "sec")
    return pearson_corr
def wrapper(*args, **kwargs) -> Tuple[float, float]:
    sample_dist = func(*args, **kwargs)
    estimate = sample_dist.mean()
    std_err_estimate = sample_dist.std()
    n_samples = len(sample_dist)
    # two-sided p-value for the estimate against zero, using the sampling
    # distribution's standard error as the scale of the t distribution
    return estimate, 2 * t.sf(
        x=abs(estimate), df=n_samples - 2, loc=0, scale=std_err_estimate)
def ttest_1samp(self, a, popmean):
    n = len(a)
    mean = self.mean(a)
    t = (mean - popmean) / (self.stan_de(a, mean) / (n**0.5))
    # T is scipy.stats.t, aliased to avoid clashing with the local name t
    p = 2 * T.sf(abs(t), n - 1)
    return [round(t, 6), round(p, 6)]
def pearsonr(self, x, y):
    n = len(x)
    if n == 0:
        return [None, None]
    sum_x = sum(x)
    sum_y = sum(y)
    sum_xy = 0.0
    sum_x2 = 0.0
    sum_y2 = 0.0
    for xi, yi in zip(x, y):  # avoid shadowing the x and y arguments
        sum_xy += xi * yi
        sum_x2 += xi**2
        sum_y2 += yi**2
    z = ((n * sum_x2 - sum_x**2) * (n * sum_y2 - sum_y**2))**0.5
    if z == 0:
        return [None, 0]
    r = (n * sum_xy - sum_x * sum_y) / z
    if abs(r) == 1:
        return [r, 0]
    tvalue = r * ((n - 2) / (1 - r**2))**0.5
    p = 2 * t.sf(x=abs(tvalue), df=n - 2)
    return (round(r, 6), round(p, 6))
def get_local_air_quality_comparison(self, city_str, tolerance=2.0):
    self.city_str = city_str
    token = "fe269bc83b983ff958090f5808afa12eed57f14f"
    req_data = get_request_data(self.base_url + self.city_str +
                                "/?token=" + token)
    lat, lng = req_data['data']['city']['geo']
    latlngbx = str(lat) + "," + str(lng) + "," + str(lat + tolerance) + \
        "," + str(lng + tolerance)
    r = requests.get(
        "https://api.waqi.info/" +
        f"/map/bounds/?latlng={latlngbx}&token={token}").json()
    if len(r['data']) > 0:
        local_df = make_dataframe(r)
        air_quality_comp = {
            'deviation': 'Not found',
            'probability': 'Not found'
        }
        deviation = local_df[local_df['name'].str.contains(
            city_str)]['aqi'].mean() - local_df['aqi'].mean()
        if not np.isnan(deviation):
            air_quality_comp['deviation'] = deviation
        probability = one_samp_t_test(
            local_df[local_df['name'].str.contains(city_str)], deviation)
        probability = t.sf(np.abs(probability), local_df.count() - 1)[0]
        if not np.isnan(probability):
            air_quality_comp['probability'] = probability
        return air_quality_comp
def calculate_t_p_error_stats(self):
    self.rating_dict = {.05: "*", .01: "**", .001: "***"}
    results = self.estimates
    stat_sig_names = ["SE", "t-stat", "p-value"]
    for stat_name in stat_sig_names:
        results[stat_name] = np.nan
    # generate statistics for each variable; results.loc[row, col] is used so
    # the assignment hits the frame itself (chained indexing may write to a copy)
    for var in self.beta_names:
        # SE of the coefficient is the square root of the diagonal of cov_matrix
        results.loc[var, "SE"] = self.cov_matrix[var][var] ** (1 / 2)
        # t-stat = Coefficient / SE
        results.loc[var, "t-stat"] = \
            results["Coefficient"][var] / results.loc[var, "SE"]
        # the p-value is estimated by transforming the t-value in reference
        # to the degrees of freedom
        results.loc[var, "p-value"] = np.round(
            t.sf(np.abs(results.loc[var, "t-stat"]),
                 self.degrees_of_freedom + 1) * 2, 5)
    # significance markers will be blank unless p-value < .05;
    # pandas does not allow np.nan values or default blank strings to be replaced
    significance = ["" for i in range(len(self.beta_names))]
    for i in range(len(self.beta_names)):
        var = self.beta_names[i]
        for val in self.rating_dict:
            if results.loc[var, "p-value"] < val:
                significance[i] = self.rating_dict[val]
                print(var, self.rating_dict[val])
    results["significance"] = significance
def calculate_t_p_error_stats(self):
    est = ["SE", "t-stat", "p-value", "p-rating"]
    rating_dict = {.001: "***", .01: "**", .05: "*"}
    for name in est:
        results = self.estimates
        results[name] = np.nan
        for var in self.beta_names:
            # .ix has long been removed from pandas; .loc[row, col] replaces it
            if name == "SE":
                # SE of coefficient is found in the diagonal of cov_matrix
                results.loc[var, name] = self.cov_matrix[var][var] ** (1 / 2)
            if name == "t-stat":
                # tstat = Coef / SE
                results.loc[var, name] = \
                    results.loc[var, "Coefficient"] / results.loc[var, "SE"]
            if name == "p-value":
                # the p-value is estimated from location within a
                # distribution implied by the t-stat
                results.loc[var, name] = round(t.sf(
                    np.abs(results.loc[var, "t-stat"]),
                    self.degrees_of_freedom + 1) * 2, 5)
            if name == "p-rating":
                for val in rating_dict:
                    if results.loc[var, "p-value"] < val:
                        results.loc[var, name] = rating_dict[val]
                        break
                else:
                    # p-value > .05: no break occurred, so leave the rating blank
                    results.loc[var, name] = ""
def _p_value_raw(self):
    """Returns the raw p values."""
    from scipy.stats import t

    result = [2 * t.sf(a, b) for a, b in zip(np.fabs(self._t_stat_raw),
                                             self._df_resid_raw)]
    return np.array(result)
def pearsonr(self, x, y):
    n = len(x)
    if n == 0:
        return [None, None]
    else:
        sumX = self.getSum(x)
        sumX2 = self.getSum2(x)
        sumY = self.getSum(y)
        sumY2 = self.getSum2(y)
        xy = 0
        for i in range(n):
            xy += float(x[i]) * float(y[i])
        f1 = n * xy - sumX * sumY
        f21 = (n * sumX2 - sumX**2)**0.5
        f22 = (n * sumY2 - sumY**2)**0.5
        f2 = f21 * f22
        if f2 == 0:
            return [None, None]
        r = f1 / f2
        r = round(r, 6)
        if r == 1 or r == -1:
            p = 0
        else:
            T = r * ((n - 2) / (1 - r**2))**0.5
            p = t.sf(abs(T), n - 2) * 2
            p = round(p, 6)
        return [r, p]
def pearsonr(self, x, y):
    sx = 0.0
    sy = 0.0
    sxy = 0.0
    sxx = 0.0
    syy = 0.0
    if len(x) == 0 or len(y) == 0:
        return [None, None]
    if len(x) != len(y):
        return [None, None]
    n = len(x)
    for i in range(0, n):
        sx += x[i]
        sy += y[i]
        sxy += x[i] * y[i]
        sxx += x[i]**2
        syy += y[i]**2
    rxy = (n * sxy - sx * sy) / ((n * sxx - sx**2) * (n * syy - sy**2))**0.5
    v = (1 - rxy**2)
    if v == 0:
        return [round(rxy, 6), 0.000000]
    t = rxy * (((n - 2) / (1 - rxy**2))**0.5)
    # two-sided p-value; the original halved the p for negative t instead of
    # doubling the upper tail of |t|
    p = 2 * T.sf(abs(t), n - 2)
    return [round(rxy, 6), round(p, 6)]
def calculate_t_p_error_stats(self):
    results = self.estimates
    stat_sig_names = ["SE", "t-stat", "p-value"]
    # create space in data frame for SE, t, and p
    for stat_name in stat_sig_names:
        results[stat_name] = np.nan
    # generate statistic for each variable; results.loc[row, col] avoids
    # chained-indexing assignment, which may silently write to a copy
    for var in self.beta_names:
        # SE ** 2 of coefficient is found in the diagonal of the cov_matrix
        results.loc[var, "SE"] = self.cov_matrix[var][var] ** (1 / 2)
        # t-stat = Coef / SE
        results.loc[var, "t-stat"] = \
            results["Coefficient"][var] / results.loc[var, "SE"]
        # the p-value is estimated using a table that transforms the t-stat
        # in light of degrees of freedom;
        # 2 is for a 2-tailed test, 5 rounds to 5 decimal places
        results.loc[var, "p-value"] = np.round(
            t.sf(np.abs(results.loc[var, "t-stat"]),
                 self.degrees_of_freedom + 1) * 2, 5)
    ratings = [.05, .01, .001]
    significance = ["" for name in self.beta_names]
    for i in range(len(self.beta_names)):
        var = self.beta_names[i]
        for rating in ratings:
            if results.loc[var, "p-value"] < rating:
                significance[i] = significance[i] + "*"
    results["significance"] = significance
def different_stdev(self, alpha):
    t0 = (self.y1 - self.y2) / (np.sqrt(self.S1**2 / self.n1 +
                                        self.S2**2 / self.n2))
    # hypothesis testing with Welch-Satterthwaite degrees of freedom
    n1, n2, y1, y2, S1, S2 = self.n1, self.n2, self.y1, self.y2, self.S1, self.S2
    df = int((S1**2 / n1 + S2**2 / n2)**2 /
             ((S1**2 / n1)**2 / (n1 - 1) + (S2**2 / n2)**2 / (n2 - 1)))
    H1a = t.ppf(1 - alpha / 2., df) < np.abs(t0)
    H1b = t.ppf(1 - alpha, df) < t0
    H1c = t.ppf(alpha, df) > t0
    # p-values
    p1a = t.sf(np.abs(t0), df) * 2
    p1b = t.sf(t0, df)
    p1c = t.cdf(t0, df)
    c1 = y1 - y2 - t.ppf(1 - alpha / 2., df) * np.sqrt(S1**2 / n1 + S2**2 / n2)
    c2 = y1 - y2 + t.ppf(1 - alpha / 2., df) * np.sqrt(S1**2 / n1 + S2**2 / n2)
    return H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2)
def calculate_t_p_error_stats(self):
    ratings = [.05, .01, .001]
    results = self.estimates
    stat_sig_names = ["SE", "t-stat", "p-value"]
    # create space in data frame for SE, t, and p
    for stat_name in stat_sig_names:
        results[stat_name] = np.nan
    # generate statistic for each variable; results.loc[row, col] avoids
    # chained-indexing assignment, which may silently write to a copy
    for var in self.beta_names:
        # SE ** 2 of coefficient is found in the diagonal of cov_matrix
        results.loc[var, "SE"] = self.cov_matrix[var][var]**(1 / 2)
        # t-stat = Coef / SE
        results.loc[var, "t-stat"] = \
            results["Coefficient"][var] / results.loc[var, "SE"]
        # the p-value is estimated using a table that transforms the t-value
        # in light of degrees of freedom
        results.loc[var, "p-value"] = np.round(
            t.sf(np.abs(results.loc[var, "t-stat"]),
                 self.degrees_of_freedom + 1) * 2, 5)
    # values for significance will be blank unless p-value < .05;
    # pandas does not allow np.nan values or default blank strings to
    # be replaced ex-post
    significance = ["" for i in range(len(self.beta_names))]
    for i in range(len(self.beta_names)):
        var = self.beta_names[i]
        for val in ratings:
            if results.loc[var, "p-value"] < val:
                significance[i] = significance[i] + "*"
    results["significance"] = significance
def lag_linregress_3D(x, y, lagx=0, lagy=0):
    """
    Input: Two xr.DataArrays of any dimensions with the first dim being time.
    Thus the input data could be a 1D time series, or for example, have three
    dimensions (time, lat, lon). Datasets can be provided in any order, but
    note that the regression slope and intercept will be calculated for y
    with respect to x.

    Output: Covariance, correlation, regression slope and intercept, p-value,
    and standard error on regression between the two datasets along their
    aligned time dimension. Lag values can be assigned to either of the data,
    with lagx shifting x, and lagy shifting y, by the specified lag amount.
    """
    # 1. Ensure that the data are properly aligned to each other.
    x, y = xr.align(x, y)

    # 2. Add lag information if any, and shift the data accordingly.
    if lagx != 0:
        # If x lags y by 1, x must be shifted 1 step backwards. But as the
        # 'zero-th' value is nonexistent, xr assigns it as invalid (nan),
        # hence it needs to be dropped.
        x = x.shift(time=-lagx).dropna(dim='time')
        # Next important step is to re-align the two datasets so that y
        # adjusts to the changed coordinates of x.
        x, y = xr.align(x, y)
    if lagy != 0:
        y = y.shift(time=-lagy).dropna(dim='time')
        x, y = xr.align(x, y)

    # 3. Compute data length, mean and standard deviation along time axis:
    n = y.notnull().sum(dim='time')
    xmean = x.mean(axis=0)
    ymean = y.mean(axis=0)
    xstd = x.std(axis=0)
    ystd = y.std(axis=0)

    # 4. Compute covariance along time axis
    cov = np.sum((x - xmean) * (y - ymean), axis=0) / (n)

    # 5. Compute correlation along time axis
    cor = cov / (xstd * ystd)

    # 6. Compute regression slope and intercept:
    slope = cov / (xstd**2)
    intercept = ymean - xmean * slope

    # 7. Compute p-value and standard error
    # Compute t-statistics
    tstats = cor * np.sqrt(n - 2) / np.sqrt(1 - cor**2)
    stderr = slope / tstats

    from scipy.stats import t
    # the two-sided p-value needs the absolute t-statistic; otherwise
    # negative correlations yield p > 1
    pval = t.sf(np.abs(tstats), n - 2) * 2
    pval = xr.DataArray(pval, dims=cor.dims, coords=cor.coords)

    return cov, cor, slope, intercept, pval, stderr
def bicor(x, y, c=9):
    """
    Biweight midcorrelation.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    c : float
        Tuning constant for the biweight estimator (default = 9.0).

    Returns
    -------
    r : float
        Correlation coefficient.
    pval : float
        Two-tailed p-value.

    Notes
    -----
    This function will return (np.nan, np.nan) if mad(x) == 0 or mad(y) == 0.

    References
    ----------
    https://en.wikipedia.org/wiki/Biweight_midcorrelation
    https://docs.astropy.org/en/stable/api/astropy.stats.biweight.biweight_midcovariance.html
    Langfelder, P., & Horvath, S. (2012). Fast R Functions for Robust
    Correlations and Hierarchical Clustering. Journal of Statistical
    Software, 46(11). https://www.ncbi.nlm.nih.gov/pubmed/23050260
    """
    from scipy.stats import t

    # Calculate median
    nx = x.size
    x_median = np.median(x)
    y_median = np.median(y)

    # Raw median absolute deviation
    x_mad = np.median(np.abs(x - x_median))
    y_mad = np.median(np.abs(y - y_median))
    if x_mad == 0 or y_mad == 0:
        # From Langfelder and Horvath 2012:
        # "Strictly speaking, a call to bicor in R should return a missing
        # value if mad(x) = 0 or mad(y) = 0." This avoids division by zero.
        return np.nan, np.nan

    # Calculate weights
    u = (x - x_median) / (c * x_mad)
    v = (y - y_median) / (c * y_mad)
    w_x = (1 - u**2)**2 * ((1 - np.abs(u)) > 0)
    w_y = (1 - v**2)**2 * ((1 - np.abs(v)) > 0)

    # Normalize x and y by weights
    x_norm = (x - x_median) * w_x
    y_norm = (y - y_median) * w_y
    denom = (np.sqrt((x_norm**2).sum()) * np.sqrt((y_norm**2).sum()))

    # Calculate r, t and two-sided p-value
    r = (x_norm * y_norm).sum() / denom
    tval = r * np.sqrt((nx - 2) / (1 - r**2))
    pval = 2 * t.sf(abs(tval), nx - 2)
    return r, pval
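# Usage sketch for bicor (added; not from the original source): random data
# with an injected linear relationship.
import numpy as np
rng = np.random.default_rng(42)
x = rng.normal(size=50)
y = 0.8 * x + rng.normal(scale=0.5, size=50)
r, pval = bicor(x, y)
# unlike the Pearson correlation, r here is robust to a few outliers in x or y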
def percbend(x, y, beta=0.2):
    """
    Percentage bend correlation (Wilcox 1994).

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    beta : float
        Bending constant for omega (0 <= beta <= 0.5).

    Returns
    -------
    r : float
        Percentage bend correlation coefficient.
    pval : float
        Two-tailed p-value.

    Notes
    -----
    Code inspired by Matlab code from Cyril Pernet and Guillaume Rousselet.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Pernet CR, Wilcox R, Rousselet GA. Robust Correlation Analyses:
       False Positive and Power Validation Using a New Open Source Matlab
       Toolbox. Frontiers in Psychology. 2012;3:606.
       doi:10.3389/fpsyg.2012.00606.
    """
    from scipy.stats import t  # missing import added for self-containment
    X = np.column_stack((x, y))
    nx = X.shape[0]
    M = np.tile(np.median(X, axis=0), nx).reshape(X.shape)
    W = np.sort(np.abs(X - M), axis=0)
    m = int((1 - beta) * nx)
    omega = W[m - 1, :]
    P = (X - M) / omega
    P[np.isinf(P)] = 0
    P[np.isnan(P)] = 0

    # Loop over columns
    a = np.zeros((2, nx))
    for c in [0, 1]:
        psi = P[:, c]
        i1 = np.where(psi < -1)[0].size
        i2 = np.where(psi > 1)[0].size
        s = X[:, c].copy()
        s[np.where(psi < -1)[0]] = 0
        s[np.where(psi > 1)[0]] = 0
        pbos = (np.sum(s) + omega[c] * (i2 - i1)) / (s.size - i1 - i2)
        a[c] = (X[:, c] - pbos) / omega[c]

    # Bend
    a[a <= -1] = -1
    a[a >= 1] = 1

    # Get r, tval and pval
    a, b = a
    r = (a * b).sum() / np.sqrt((a**2).sum() * (b**2).sum())
    tval = r * np.sqrt((nx - 2) / (1 - r**2))
    pval = 2 * t.sf(abs(tval), nx - 2)
    return r, pval
def _p_values(self):
    """
    Return the model's coefficient/parameter p-values.

    :return: Numpy array
    """
    p_vals = [t.sf(abs(x), self.deg_of_freedom) * 2
              for x in self.t_statistics]
    return p_vals
def equal_stdev(self, alpha):
    n1, n2, y1, y2 = self.n1, self.n2, self.y1, self.y2
    # pooled standard deviation
    Sp = np.sqrt(((n1 - 1) * self.S1**2 + (n2 - 1) * self.S2**2) /
                 (n1 + n2 - 2))
    t0 = (y1 - y2) / (Sp * np.sqrt(1. / n1 + 1. / n2))
    # hypothesis testing
    H1a = t.ppf(1 - alpha / 2., n1 + n2 - 2) < np.abs(t0)
    H1b = t.ppf(1 - alpha, n1 + n2 - 2) < t0
    H1c = t.ppf(alpha, n1 + n2 - 2) > t0
    # p-values
    p1a = t.sf(np.abs(t0), n1 + n2 - 2) * 2
    p1b = t.sf(t0, n1 + n2 - 2)
    p1c = t.cdf(t0, n1 + n2 - 2)
    c1 = y1 - y2 - t.ppf(1 - alpha / 2., n1 + n2 - 2) * Sp * np.sqrt(1. / n1 + 1. / n2)
    c2 = y1 - y2 + t.ppf(1 - alpha / 2., n1 + n2 - 2) * Sp * np.sqrt(1. / n1 + 1. / n2)
    return H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2)
def _compute_pvalue(self):
    """Returns the p-value."""
    # string comparisons must use ==, not `is` (identity), which only
    # happens to work when literals are interned
    if self.test_statistic_name == "t":
        if self.side == "less_than":
            return student_t.cdf(self.test_statistic, self.deg_of_freedom)
        elif self.side == "greater_than":
            return student_t.sf(self.test_statistic, self.deg_of_freedom)
        else:  # side is "not_equal"
            return 2 * student_t.sf(abs(self.test_statistic),
                                    self.deg_of_freedom)
    elif self.test_statistic_name == "z":
        if self.side == "less_than":
            return norm.cdf(self.test_statistic)
        elif self.side == "greater_than":
            return norm.sf(self.test_statistic)
        else:  # side is "not_equal"
            return 2 * norm.sf(abs(self.test_statistic))
def t_test(cat):
    # pooled proportion from the two groups
    p = cat['avg_x'] * cat['cnt_x'] + cat['avg_y'] * cat['cnt_y']
    p = p / (cat['cnt_x'] + cat['cnt_y'])
    p += 1e-8  # guard against division by zero in the variance term
    z = (cat['avg_x'] - cat['avg_y']) / np.sqrt(
        (p * (1 - p) * (1 / cat['cnt_x'] + 1 / cat['cnt_y'])))
    p_value = t.sf(abs(z), df=cat['cnt_x'] + cat['cnt_y'] - 2) * 2
    return p_value
def t_tests_on_mean(mu_0, x_var, s, n, alpha, power=None):
    print("Two-Sided t-Test - H_0 : μ = {} vs H_A : μ ≠ {}".format(mu_0, mu_0))
    print("with x_var {}, s {}, n {}, α {} :\n".format(x_var, s, n, alpha))
    t_statistic = (x_var - mu_0) / (s / sqrt(n))
    p_value = t.sf(np.abs(t_statistic), n - 1) * 2
    print("t-statistic : {:.4f}, p-value : 2P(T>=|t|) = {:.3f}".format(
        t_statistic, p_value))
    print("The null hypothesis is {}".format(
        "Accepted" if p_value > alpha else "Rejected"))
    print("========================================================================")

    print("One-Sided t-Test - H_0 : μ <= {} vs H_A : μ > {}".format(mu_0, mu_0))
    print("with x_var {}, s {}, n {}, α {} :\n".format(x_var, s, n, alpha))
    p_value = t.sf(t_statistic, n - 1)
    print("t-statistic : {:.4f}, p-value : P(T>=t) = {:.3f}".format(
        t_statistic, p_value))
    print("The null hypothesis is {}".format(
        "Accepted" if p_value > alpha else "Rejected"))
    print("========================================================================")

    print("One-Sided t-Test - H_0 : μ >= {} vs H_A : μ < {}".format(mu_0, mu_0))
    print("with x_var {}, s {}, n {}, α {} :\n".format(x_var, s, n, alpha))
    p_value = 1 - t.sf(t_statistic, n - 1)
    print("t-statistic : {:.4f}, p-value : P(T<=t) = {:.3f}".format(
        t_statistic, p_value))
    print("The null hypothesis is {}".format(
        "Accepted" if p_value > alpha else "Rejected"))
    print("========================================================================")

    if power is not None:
        raise NotImplementedError
        # the sample-size report below is unreachable until implemented
        print("Power >= {} requires n >= {}".format(power, 1))
        print("========================================================================")
def welch(n1, m1, var1, n2, m2, var2, alpha=0.05):
    _t = (m1 - m2) / np.sqrt(var1 / n1 + var2 / n2)
    var1_SE, var2_SE = var1 / n1, var2 / n2
    # Welch's (1947) degrees-of-freedom approximation
    _v = (var1_SE + var2_SE)**2 / (var1_SE**2 / (n1 + 1) +
                                   var2_SE**2 / (n2 + 1)) - 2
    _p = t.sf(_t, _v)  # one-sided p-value (H1: m1 > m2)
    return _t, _v, _p
def satterthwaite(n1, m1, var1, n2, m2, var2, alpha=0.05):
    _t = (m1 - m2) / np.sqrt(var1 / n1 + var2 / n2)
    var1_SE, var2_SE = var1 / n1, var2 / n2
    # Welch-Satterthwaite degrees-of-freedom approximation
    _v = (var1_SE + var2_SE)**2 / (var1_SE**2 / (n1 - 1) +
                                   var2_SE**2 / (n2 - 1))
    _p = t.sf(_t, _v)  # one-sided p-value (H1: m1 > m2)
    return _t, _v, _p
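# Comparison sketch (added; illustrative numbers, not from the original).
# satterthwaite() matches the df formula used by
# scipy.stats.ttest_ind(..., equal_var=False); welch() above uses Welch's
# 1947 variant (n + 1 terms, minus 2), which is typically very close.
t_w, v_w, p_w = welch(n1=12, m1=5.1, var1=2.0, n2=15, m2=4.3, var2=3.5)
t_s, v_s, p_s = satterthwaite(n1=12, m1=5.1, var1=2.0, n2=15, m2=4.3, var2=3.5)
# v_w and v_s differ only slightly; both p-values are one-sided (H1: m1 > m2)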
def pearsonr(self, x, y):
    n = len(x)
    if n == 0:
        return [None, None]
    r = (n * self.proSum(x, y) - self.summary(x) * self.summary(y)) / (
        ((n * self.squareSum(x) - self.summary(x)**2) *
         (n * self.squareSum(y) - self.summary(y)**2))**0.5)
    t = r * (float(n - 2) / (1 - r**2))**0.5
    p = 2 * T.sf(abs(t), n - 2)
    return [round(r, 6), round(p, 6)]
def full_glm_results(endog_arr, exog_vars, return_resids=False,
                     only_tvals=False, PCA_whiten=False, ZCA_whiten=False,
                     orthogonalize=True, orthogNear=False,
                     orthog_GramSchmidt=False):
    if np.mean(exog_vars[:, 0]) != 1:
        print("Warning: the intercept is not included as the first column "
              "in your exogenous variable array")
    n, num_depv = endog_arr.shape
    k = exog_vars.shape[1]

    if orthogonalize:
        exog_vars = sm.add_constant(orthog_columns(exog_vars[:, 1:]))
    elif orthogNear:
        exog_vars = sm.add_constant(ortho_neareast(exog_vars[:, 1:]))
    elif orthog_GramSchmidt:
        # for when order matters AKA type 2 sum of squares
        exog_vars = sm.add_constant(gram_schmidt_orthonorm(exog_vars[:, 1:]))
    else:
        pass

    invXX = np.linalg.inv(np.dot(exog_vars.T, exog_vars))

    DFbetween = k - 1  # aka df model
    DFwithin = n - k   # aka df residuals
    DFtotal = n - 1
    if PCA_whiten:
        endog_arr = PCAwhiten(endog_arr)
    if ZCA_whiten:
        endog_arr = ZCAwhiten(endog_arr)

    a = cy_lin_lstsqr_mat(exog_vars, endog_arr)
    sigma2 = np.sum((endog_arr - np.dot(exog_vars, a))**2, axis=0) / (n - k)
    se = se_of_slope(num_depv, invXX, sigma2, k)

    if only_tvals:
        return a / se
    else:
        resids = endog_arr - np.dot(exog_vars, a)
        RSS = np.sum(resids**2, axis=0)
        TSS = np.sum((endog_arr - np.mean(endog_arr, axis=0))**2, axis=0)
        R2 = 1 - (RSS / TSS)
        std_y = np.sqrt(TSS / DFtotal)
        R2_adj = 1 - ((1 - R2) * DFtotal / (DFwithin))
        Fvalues = ((TSS - RSS) / (DFbetween)) / (RSS / DFwithin)
        Tvalues = a / se
        # note: conventional OLS would use DFwithin (n - k) as the df here
        Pvalues = t.sf(np.abs(Tvalues), DFtotal) * 2
        if return_resids:
            fitted = np.dot(exog_vars, a)
            return (Fvalues, Tvalues, Pvalues, R2, R2_adj,
                    np.array(resids), np.array(fitted))
        else:
            return (Fvalues, Tvalues, Pvalues, R2, R2_adj)
def _overlap_output(self, category_names, overlap_matrix, M_annot, M_tot,
                    print_coefficients):
    '''LD Score regression summary for overlapping categories.'''
    overlap_matrix_prop = np.zeros([self.n_annot, self.n_annot])
    for i in range(self.n_annot):
        overlap_matrix_prop[i, :] = overlap_matrix[i, :] / M_annot
    prop_hsq_overlap = np.dot(
        overlap_matrix_prop, self.prop.T).reshape((1, self.n_annot))
    prop_hsq_overlap_var = np.diag(
        np.dot(np.dot(overlap_matrix_prop, self.prop_cov),
               overlap_matrix_prop.T))
    prop_hsq_overlap_se = np.sqrt(
        np.maximum(0, prop_hsq_overlap_var)).reshape((1, self.n_annot))
    one_d_convert = lambda x: np.array(x).reshape(np.prod(x.shape))
    prop_M_overlap = M_annot / M_tot
    enrichment = prop_hsq_overlap / prop_M_overlap
    enrichment_se = prop_hsq_overlap_se / prop_M_overlap
    overlap_matrix_diff = np.zeros([self.n_annot, self.n_annot])
    for i in range(self.n_annot):
        if not M_tot == M_annot[0, i]:
            overlap_matrix_diff[i, :] = overlap_matrix[i, :] / M_annot[0, i] - \
                (M_annot - overlap_matrix[i, :]) / (M_tot - M_annot[0, i])
    diff_est = np.dot(overlap_matrix_diff, self.coef)
    diff_cov = np.dot(np.dot(overlap_matrix_diff, self.coef_cov),
                      overlap_matrix_diff.T)
    diff_se = np.sqrt(np.diag(diff_cov))
    diff_p = [np.nan if diff_se[i] == 0 else
              2 * tdist.sf(abs(diff_est[i] / diff_se[i]), self.n_blocks)
              for i in range(self.n_annot)]

    coef_z = []
    for i in range(self.n_annot):
        if one_d_convert(self.coef)[i] == 0 and one_d_convert(self.coef_se)[i] == 0:
            coef_z.append(0)
        elif one_d_convert(self.coef_se)[i] == 0:
            coef_z.append('NA')
        else:
            coef_z.append(one_d_convert(self.coef)[i] /
                          one_d_convert(self.coef_se)[i])

    df = pd.DataFrame({
        'Category': category_names,
        'Prop._SNPs': one_d_convert(prop_M_overlap),
        'Prop._h2': one_d_convert(prop_hsq_overlap),
        'Prop._h2_std_error': one_d_convert(prop_hsq_overlap_se),
        'Enrichment': one_d_convert(enrichment),
        'Enrichment_std_error': one_d_convert(enrichment_se),
        'Enrichment_p': diff_p,
        'Coefficient': one_d_convert(self.coef),
        'Coefficient_std_error': self.coef_se,
        'Coefficient_z-score': coef_z
    })
    if print_coefficients:
        df = df[['Category', 'Prop._SNPs', 'Prop._h2', 'Prop._h2_std_error',
                 'Enrichment', 'Enrichment_std_error', 'Enrichment_p',
                 'Coefficient', 'Coefficient_std_error',
                 'Coefficient_z-score']]
    else:
        df = df[['Category', 'Prop._SNPs', 'Prop._h2', 'Prop._h2_std_error',
                 'Enrichment', 'Enrichment_std_error', 'Enrichment_p']]
    return df
def reducer(self, key, info):
    '''
    Calculates all the values we will need for simple linear
    regression analysis, and does the analysis itself.
    '''
    # not the most efficient, but we want to keep these values
    # to calculate standard errors
    info = list(info)
    # calculate sums
    sumx, sumy, sumxx, sumyy, sumxy, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
    for (x, y) in info:
        sumx += x
        sumy += y
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y
        n += 1
    # calculate correlation
    corr = 0
    corr_denom = math.sqrt((n * sumxx - sumx**2) * (n * sumyy - sumy**2))
    if corr_denom < 0.0001:
        yield False, "Could not calculate coefficients"
        return
    corr_num = n * sumxy - sumx * sumy
    corr = corr_num / corr_denom
    if abs(corr) < 0.0001:
        yield False, "Could not calculate coefficients"
        return
    # calculate regression coefficients
    beta1 = (sumxy - sumx * sumy / n) / (sumxx - sumx**2 / n)
    beta0 = (sumy - beta1 * sumx) / n
    # calculate standard errors
    # note: this is the reason why this isn't in a regression class;
    # fitted values must come from the x's, and residuals must be squared
    y_reals = [y for (x, y) in info]
    y_hats = [beta0 + beta1 * x for (x, y) in info]
    s_num = sum([(y - yhat)**2 for (y, yhat) in zip(y_reals, y_hats)])
    s = math.sqrt(s_num / (n - 2))
    se_denom = n * sumxx - sumx**2
    # se_beta0 = s * math.sqrt(sumxx / se_denom)
    se_beta1 = s * math.sqrt(n / se_denom)
    # calculate t-values
    # t0 = beta0 / se_beta0
    t1 = beta1 / se_beta1
    # calculate 2-sided p-values
    # alpha = 0.05
    # t_stat = t.ppf(1 - alpha/2, n - 2)
    # beta0_p_value = t.sf(abs(t0), n - 2) * 2
    beta1_p_value = t.sf(abs(t1), n - 2) * 2
    yield None, (beta1, beta1_p_value, corr)
def ttest_1samp(self, a, popmean):
    mean, s = 0.0, 0.0
    mean = sum(a) / float(len(a))
    for i in a:
        s += (i - mean)**2
    s /= (len(a) - 1)
    s = s**0.5
    T = (mean - popmean) / (s / (len(a))**0.5)
    P = t.sf(abs(T), len(a) - 1) * 2
    return [round(T, 6), round(P, 6)]
def significantly_different_genes(
        rpkm_table, experiment_groups, intergroups, target_p_value=0.05):
    """
    Performs a test that uses the error function to determine if we can
    reject the hypothesis that all the genes are sampled from the same
    distribution.

    :param rpkm_table: table of the rpkm values
    :param experiment_groups: groups on indexes
    :param intergroups: the groups between which we want to do the comparisons
    :param target_p_value: p_value with which we want to be able to reject
        the null hypothesis
    """
    groups_means = np.zeros((rpkm_table.shape[0], len(experiment_groups)))
    groups_var = np.zeros((rpkm_table.shape[0], len(experiment_groups)))
    for i, group in enumerate(experiment_groups):
        groups_means[:, i] = np.mean(rpkm_table[:, group], axis=1)
        groups_var[:, i] = np.var(rpkm_table[:, group], axis=1) / \
            estimator_dilatation_table[len(group)] ** 2
    group_comparison = []
    for bi_group in intergroups:
        groups_mean_difference = np.fabs(
            groups_means[:, bi_group[0]] - groups_means[:, bi_group[1]])
        groups_combined_std = np.sqrt(
            groups_var[:, bi_group[0]] + groups_var[:, bi_group[1]])
        p_val = t.sf(groups_mean_difference / groups_combined_std,
                     (len(experiment_groups[bi_group[0]]) +
                      len(experiment_groups[bi_group[1]])) / 2)
        # step-up selection of significant p-values against a linearly
        # increasing threshold (Benjamini-Hochberg-style)
        sorted_p_vals = np.sort(p_val, axis=0)
        lower_index = np.array(range(0, sorted_p_vals.shape[0])) * \
            target_p_value / sorted_p_vals.shape[0]
        pre_filter_mask = sorted_p_vals <= lower_index
        filter_mask = pre_filter_mask
        if np.any(pre_filter_mask):
            refined_threshold = np.max(sorted_p_vals[pre_filter_mask])
            filter_mask = p_val < refined_threshold
        group_comparison.append((p_val, filter_mask))
    return group_comparison
def pVal(self):
    p = {}
    for name, sab in self.sab.items():
        ssa = self.ssa[name]
        ssb = self.ssb[name]
        dof = self.dof[name]
        r = CorrCurves.calc(sab, ssa, ssb)
        df = dof - 1
        t = r * np.sqrt(df / (1 - r**2))
        rawP = tDist.sf(np.abs(t), df)
        p[name] = CorrCurves.bonferroni(rawP)
    return p
def pairedTTest(y1, y2):
    y1, y2 = array(y1), array(y2)
    n = len(y1)
    y_diff = y1 - y2
    y_diff_mean, y_diff_sd = mean(y_diff), std(y_diff)
    t = y_diff_mean / (y_diff_sd / sqrt(n))
    # one-tailed p-value of |t|; double it for the two-sided test
    p = spt.sf(np.abs(t), n - 1)
    # effect size: y2 expressed in z-units of y1
    y1_mean, y1_std = mean(y1), std(y1)
    y1_y1z = (y1 - y1_mean) / y1_std
    y2_y1z = (y2 - y1_mean) / y1_std
    # assert mean(y1_y1z) == 0.000, "y1 mean not zero, %.5f" % mean(y1_y1z)
    # (will be close enough to zero)
    d = mean(y2_y1z)
    return (t, p, d)
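# Cross-check sketch for pairedTTest (added; not in the original). SciPy's
# paired t-test reports the two-sided p-value, i.e. twice the p returned above.
from scipy.stats import ttest_rel
before = [12.1, 11.4, 13.2, 12.8, 11.9]
after = [11.2, 10.9, 12.5, 12.1, 11.0]
t_sp, p_sp = ttest_rel(before, after)
# note: pairedTTest uses np.std's default ddof=0, so its t differs slightly
# from scipy's, which uses the ddof=1 sample standard deviation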
def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None,
             df_num=None):
    if F is not None:
        self.fvalue = F
        self.df_denom = df_denom
        self.df_num = df_num
        self.pvalue = fdist.sf(F, df_num, df_denom)
    else:
        self.tvalue = t
        self.sd = sd
        self.effect = effect
        self.df_denom = df_denom
        self.pvalue = student_t.sf(np.abs(t), df_denom)
def show_correlation_coefficient_stats():
    # Get the values from the entries in the window
    try:
        correlation_coefficient = float(enter_coefficient.get())
    except:
        correlation_coefficient = ''
    try:
        number_of_samples = int(enter_number_of_samples.get())
    except:
        number_of_samples = ''
    try:
        tails = int(enter_tails.get())
    except:
        tails = ''
    try:
        correlation_type = enter_correlation_type.get()
    except:
        correlation_type = ''
    try:
        level_of_significance = float(enter_level_of_significance.get())
    except:
        level_of_significance = ''
    # Fill in defaults for missing optional values
    if tails == '':
        tails = 2
    if correlation_type == '':
        correlation_type = 'pearson'
    if level_of_significance == '':
        level_of_significance = 0.05
    # Show an alarm if required values are missing
    if correlation_coefficient == '' or number_of_samples == '':
        messagebox.showwarning(title="Error",
                               message="Missing critical values!")
    else:
        # Calculation of the Student's t-distribution
        degrees_of_freedom = number_of_samples - 2
        if correlation_type == "pearson":
            t_value = correlation_coefficient * sqrt(
                (number_of_samples - 2) / (1 - correlation_coefficient**2))
            # Calculate the one-tail p-value
            p_value = t.sf(t_value, degrees_of_freedom)
            if tails == 1:
                messagebox.showinfo(
                    title="Correlation p-value",
                    message="The p-value for a %s correlation coefficient "
                            "(of %s) computed on %s samples is: %s"
                            % (correlation_type, correlation_coefficient,
                               number_of_samples, p_value))
            # Calculate the two-tail p-value
            elif tails == 2:
                p_value = p_value * 2
                messagebox.showinfo(
                    title="Correlation p-value",
                    message="The p-value for a %s correlation coefficient "
                            "(of %s) computed on %s samples is: %s"
                            % (correlation_type, correlation_coefficient,
                               number_of_samples, p_value))
            # Significance
            if p_value <= level_of_significance:
                messagebox.showinfo(
                    title="Significance",
                    message="The calculated correlation coefficient IS "
                            "statistically significant at a level of "
                            "significance of %s" % (level_of_significance,))
            else:
                messagebox.showinfo(
                    title="Significance",
                    message="The calculated correlation coefficient is NOT "
                            "statistically significant at a level of "
                            "significance of %s" % (level_of_significance,))
def _overlap_output(self, category_names, overlap_matrix, M_annot, M_tot,
                    print_coefficients):
    '''LD Score regression summary for overlapping categories.'''
    overlap_matrix_prop = np.zeros([self.n_annot, self.n_annot])
    for i in range(self.n_annot):
        overlap_matrix_prop[i, :] = overlap_matrix[i, :] / M_annot
    prop_hsq_overlap = np.dot(
        overlap_matrix_prop, self.prop.T).reshape((1, self.n_annot))
    prop_hsq_overlap_var = np.diag(
        np.dot(np.dot(overlap_matrix_prop, self.prop_cov),
               overlap_matrix_prop.T))
    prop_hsq_overlap_se = np.sqrt(
        np.maximum(0, prop_hsq_overlap_var)).reshape((1, self.n_annot))
    one_d_convert = lambda x: np.array(x).reshape(np.prod(x.shape))
    prop_M_overlap = M_annot / M_tot
    enrichment = prop_hsq_overlap / prop_M_overlap
    enrichment_se = prop_hsq_overlap_se / prop_M_overlap
    overlap_matrix_diff = np.zeros([self.n_annot, self.n_annot])
    for i in range(self.n_annot):
        if not M_tot == M_annot[0, i]:
            overlap_matrix_diff[i, :] = overlap_matrix[i, :] / M_annot[0, i] - \
                (M_annot - overlap_matrix[i, :]) / (M_tot - M_annot[0, i])
    diff_est = np.dot(overlap_matrix_diff, self.coef)
    diff_cov = np.dot(np.dot(overlap_matrix_diff, self.coef_cov),
                      overlap_matrix_diff.T)
    diff_se = np.sqrt(np.diag(diff_cov))
    diff_p = ['NA' if diff_se[i] == 0 else
              2 * tdist.sf(abs(diff_est[i] / diff_se[i]), self.n_blocks)
              for i in range(self.n_annot)]

    df = pd.DataFrame({
        'Category': category_names,
        'Prop._SNPs': one_d_convert(prop_M_overlap),
        'Prop._h2': one_d_convert(prop_hsq_overlap),
        'Prop._h2_std_error': one_d_convert(prop_hsq_overlap_se),
        'Enrichment': one_d_convert(enrichment),
        'Enrichment_std_error': one_d_convert(enrichment_se),
        'Enrichment_p': diff_p,
        'Coefficient': one_d_convert(self.coef),
        'Coefficient_std_error': self.coef_se,
        'Coefficient_z-score': one_d_convert(self.coef) / one_d_convert(self.coef_se)
    })
    if print_coefficients:
        df = df[['Category', 'Prop._SNPs', 'Prop._h2', 'Prop._h2_std_error',
                 'Enrichment', 'Enrichment_std_error', 'Enrichment_p',
                 'Coefficient', 'Coefficient_std_error',
                 'Coefficient_z-score']]
    else:
        df = df[['Category', 'Prop._SNPs', 'Prop._h2', 'Prop._h2_std_error',
                 'Enrichment', 'Enrichment_std_error', 'Enrichment_p']]
    return df
def pers(x, y):
    assert len(x) == len(y)
    x_bar = mean(x)
    y_bar = mean(y)
    s_x = std(x, ddof=1)
    s_y = std(y, ddof=1)
    tmp = 0.0
    for i in range(0, len(x)):
        tmp += (x[i] - x_bar) * (y[i] - y_bar)
    r = tmp / (len(x) - 1) / s_x / s_y
    # guard both perfect correlations; r == -1 would otherwise divide by zero
    if abs(r) == 1:
        return [r, 0]
    tt = r * sqrt((len(x) - 2) / (1 - r ** 2))
    p = t.sf(abs(tt), len(x) - 2) * 2
    return [r, p]
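# Cross-check sketch for pers (added; not in the original): with ddof=1 on
# both standard deviations, r and p should match scipy.stats.pearsonr.
from scipy.stats import pearsonr as scipy_pearsonr
xs = [1.0, 2.0, 3.0, 4.0, 5.0]
ys = [2.1, 3.9, 6.2, 8.1, 9.8]
r_ours, p_ours = pers(xs, ys)
r_sp, p_sp = scipy_pearsonr(xs, ys)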
def pt(x, df=1, loc=0, scale=1, ncp=None, lowertail=True, log=False):
    """
    The cumulative distribution function for the t distribution.
    You provide a value along the t distribution (eg x=3) or an array of
    values, and it returns what proportion of values lie below it
    (the quantile).

    Alternatively, if you select lowertail=False, it returns the proportion
    of values that are above it.

    ARGS:
    ---------------
    :param x (float, array of floats):
        The values along the distribution.
    :param df (float):
        degrees of freedom
    :param loc:      array_like, optional
        location parameter (default=0)
    :param scale:    float, optional
        scale (default=1)
    :param ncp (float):
        non-centrality parameter delta. Currently not implemented.
    :param lowertail (bool):
        are you interested in what proportion of values lie beneath x?
        or above x (False)?
    :param log (boolean):
        Use log scale?

    RETURN:
    ---------------
    :return:        an array of quantiles() corresponding to the values in x
    """
    if lowertail and not log:
        return t.cdf(x, df=df, loc=loc, scale=scale)
    elif not lowertail and not log:
        return t.sf(x, df=df, loc=loc, scale=scale)
    elif lowertail and log:
        return t.logcdf(x, df=df, loc=loc, scale=scale)
    else:
        return t.logsf(x, df=df, loc=loc, scale=scale)
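# Usage sketch for pt (added; not in the original), mirroring R's pt():
# the default is the lower-tail probability; lowertail=False matches
# R's lower.tail=FALSE.
lower = pt(1.96, df=30)                   # ~0.970
upper = pt(1.96, df=30, lowertail=False)  # ~0.030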
def approximate_MH_accept(mu_0, log_lik, X, batch_size, epsilon, theta_prime,
                          theta_t, N):
    iteration_number = 0
    while True:
        iteration_number += 1
        n = iteration_number * batch_size
        n = min(n, N)
        sub = np.random.choice(X, n, replace=False)
        sub = log_lik(sub, theta_prime) - log_lik(sub, theta_t)
        l_hat = np.mean(sub)
        l_2_hat = np.mean(sub**2)
        # sample standard deviation; the Bessel correction must scale the
        # whole (E[l^2] - E[l]^2) term, not just the E[l]^2 part
        s_l = np.sqrt((l_2_hat - l_hat**2) * n / (n - 1))
        # finite-population correction for sampling without replacement
        s = s_l / np.sqrt(n) * np.sqrt(1 - (n - 1) / (N - 1))
        t_students_var = (l_hat - mu_0) / s
        stat = np.abs(t_students_var)
        delta = t.sf(stat, n - 1)
        if delta < epsilon:
            if l_hat > mu_0:
                return True, n
            return False, n
def test_pvalue(self):
    assert_almost_equal(
        self.Ttest.pvalue,
        student_t.sf(np.abs(self.res1.tvalues), self.res1.model.df_resid) * 2,
        DECIMAL_4)
def pvalues(self):
    # TODO: same for conditional and unconditional?
    df_resid = self.df_resid
    return t.sf(np.abs(self.tvalues), df_resid) * 2
def pvalues(self):
    return t.sf(np.abs(self.tvalues), self.df_resid) * 2
subjs_fname = "/Users/sudregp/data/meg/good_subjects.txt"
group_fname = "/Users/sudregp/data/meg/%s_subjs.txt" % group
data_dir = "/Users/sudregp/data/results/meg/"
fid = open(subjs_fname, "r")
subjs = [line.rstrip() for line in fid]
fid.close()
fid = open(group_fname, "r")
this_group = [line.rstrip() for line in fid]
fid.close()
# load the pre-computed correlation data
fname = data_dir + "corrs-seed%d-%dto%d-lh.stc" % (seed_src, band[0], band[1])
stc = mne.read_source_estimate(fname)
y = [s in this_group for s in subjs]
y = np.asarray(y).T
X = np.mean(stc.data[:, y], axis=1)
print("Subjects in %s: %d" % (group, np.sum(y)))
if fdr > 0:
    n = sum(y)
    # from http://www.danielsoper.com/statcalc3/calc.aspx?id=44
    tstat = X / np.sqrt((1 - X ** 2) / (n - 2))
    # t.sf gives the one-tailed version; doubling makes it two-tailed
    pval = t.sf(tstat, n - 1) * 2
    reject_fdr, pval_fdr = mne.stats.fdr_correction(pval, alpha=fdr,
                                                    method="indep")
    X[~reject_fdr] = 0
stc2 = mne.SourceEstimate(X[:, None], vertices=stc.vertno, tmin=0, tstep=0,
                          subject="fsaverage")
brain = stc2.plot(hemi="both", fmin=min(X),
                  fmid=(min(X) + (max(X) - min(X)) / 2), fmax=max(X))
def pval(x, standard_error, df=800, tail=2):
    # tail=2 gives a two-sided p-value, tail=1 a one-sided one
    pval = t.sf(np.abs((x - 0) / standard_error), df) * tail
    return pval
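# Usage sketch for pval (added; illustrative numbers, not from the original):
# two-sided p-value for a regression coefficient of 0.42 with SE 0.15.
p_two = pval(0.42, 0.15)           # tail=2, df=800 by default
p_one = pval(0.42, 0.15, tail=1)   # one-sided version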
def _p_value_raw(self):
    """Returns the raw p values."""
    from scipy.stats import t

    return 2 * t.sf(np.fabs(self._t_stat_raw), self._df_resid_raw)
def glm(x, y, w=1.0):
    p, n = shape(x)  # sample size
    p += 1           # add one for intercept
    dof = n - p      # degrees of freedom
    sig = var(y)     # variance
    mu = (y + mean(y)) / 2.0    # initial mean estimate
    eta = log(mu)               # initial predictor
    X = vstack((ones(n), x)).T  # observed x-variable matrix

    # Newton-Raphson :
    converged = False
    rtol = 1e-12
    dtol = 1e-12
    lmbda = 1.0
    nIter = 0
    deviance = 1
    D = 1
    ahat = zeros(p)     # initial parameters
    rel_res = zeros(p)  # initial relative residual
    maxIter = 100
    rel_a = []
    dev_a = []
    while not converged and nIter < maxIter:
        W = diags(w * mu**2 / sig, 0)  # compute weights
        z = eta + (y - mu) / mu        # adjusted dependent variable
        WX = W.dot(X)
        XTWX = dot(X.T, WX)
        iXTWX = inv(XTWX)
        Wz = W.dot(z)
        ahat_n = dot(iXTWX, dot(X.T, Wz))
        eta = dot(X, ahat_n)  # compute estimates
        mu = exp(eta)         # linear predictor
        # calculate residual :
        rel_res = norm(ahat - ahat_n, inf)
        rel_a.append(rel_res)
        ahat = ahat_n
        D_n = sum((y - mu)**2)
        deviance = abs(D_n - D)
        D = D_n
        dev_a.append(deviance)
        if rel_res < rtol or deviance < dtol:
            converged = True
        nIter += 1
        string = ("Newton iteration %d: d (abs) = %.2e, (tol = %.2e) "
                  "r (rel) = %.2e (tol = %.2e)")
        print(string % (nIter, deviance, dtol, rel_res, rtol))

    # calculate statistics :
    varA = diag(iXTWX)  # variance of alpha hat
    sea = sqrt(varA)    # vector of standard errors for alpha hat
    t_a = ahat / sea
    pval = t.sf(abs(t_a), dof) * 2
    conf = 0.95                         # 95% confidence interval
    tbonf = t.ppf((1 - conf / p), dof)  # bonferroni corrected t-value
    ci = tbonf * sea                    # confidence interval for ahat
    resid = (y - mu)                    # 'working' residual

    RSS = sum((y - mu)**2)       # residual sum of squares
    TSS = sum((y - mean(y))**2)  # total sum of squares
    R2 = (TSS - RSS) / TSS       # R2
    F = (TSS - RSS) / (p - 1) * (n - p) / RSS  # F-statistic
    F_p = fdtrc(p - 1, dof, F)   # F-stat. p-value

    # log-likelihood :
    L = sum((y * mu - mu**2 / 2) / (2 * sig) - y**2 / (2 * sig)
            - 0.5 * log(2 * pi * sig))
    AIC = (-2 * L + 2 * p) / n  # AIC statistic

    # estimated error variance :
    sighat = 1 / (n - p) * RSS

    vara = {'ahat': ahat,
            'yhat': mu,
            'sea': sea,
            'ci': ci,
            'dof': dof,
            'resid': resid,
            'rel_a': rel_a,
            'dev_a': dev_a,
            'R2': R2,
            'F': F,
            'AIC': AIC,
            'sighat': sighat}
    return vara
fc_fmri_ctl_a = fc_fmri_ctl_var / 9.0
fc_fmri_dms_a = fc_fmri_dms_var / 9.0

# (3) Add results obtained for CTL and DMS in step (2) together:
fc_syn_a = fc_syn_ctl_a + fc_syn_dms_a
fc_fmri_a = fc_fmri_ctl_a + fc_fmri_dms_a

# (4) Take the square root of the results in step (3):
sqrt_fc_syn_a = np.sqrt(fc_syn_a)
sqrt_fc_fmri_a = np.sqrt(fc_fmri_a)

# (5) Divide the results of step (1) by the results of step (4) to obtain 't':
fc_syn_t = fc_syn_mean_diff / sqrt_fc_syn_a
fc_fmri_t = fc_fmri_mean_diff / sqrt_fc_fmri_a

# (6) Calculate the degrees of freedom (add up the number of observations
# for each group, minus the number of groups):
dof = 10 + 10 - 2

# (7) Find the p-values for the above 't' and degrees of freedom:
fc_syn_p_values = t.sf(fc_syn_t, dof)
fc_fmri_p_values = t.sf(fc_fmri_t, dof)

print('t-values for synaptic activity correlations: ', fc_syn_t)
print('t-values for fmri time-series correlations: ', fc_fmri_t)

# convert to Pandas dataframe, using the transpose to convert to a format
# where the names of the modules are the labels for each time-series
fc_mean = pd.DataFrame(np.array([fc_syn_dms_mean, fc_syn_ctl_mean,
                                 fc_fmri_dms_mean, fc_fmri_ctl_mean]),
                       columns=np.array(['V1', 'V4', 'FS', 'D1', 'D2',
                                         'FR', 'LIT']),
                       index=np.array(['DMS-syn', 'CTL-syn',
                                       'DMS-fmri', 'CTL-fmri']))
# fc_std = pd.DataFrame(np.array([fc_syn_dms_std, fc_syn_ctl_std,
#                                 fc_fmri_dms_std, fc_fmri_ctl_std]),
#                       columns=np.array(['V1', 'V4', 'D1', 'D2', 'FS', 'FR']),
#                       index=np.array(['DMS-syn', 'CTL-syn',
#                                       'DMS-fmri', 'CTL-fmri']))