def cronbach_alpha(data=None, items=None, scores=None, subject=None,
                   nan_policy='pairwise', ci=.95):
    """Cronbach's alpha reliability measure of internal consistency.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide-format dataframe (one column per item) or long-format when
        ``items``, ``scores`` and ``subject`` are all provided.
    items, scores, subject : str or None
        Column names used to pivot long-format data to wide format.
    nan_policy : {'pairwise', 'listwise'}
        'listwise' drops any row containing a NaN before computing the
        covariance; 'pairwise' relies on pandas' pairwise covariance.
    ci : float
        Confidence level for the Feldt-style interval based on F quantiles.

    Returns
    -------
    (alpha, ci_bounds) : (float, numpy.ndarray)
        Alpha rounded to 6 decimals and the [lower, upper] bounds rounded
        to 3 decimals.
    """
    # Safety checks (kept as asserts to match the file's existing style).
    assert isinstance(data, pd.DataFrame), 'data must be a dataframe.'
    assert nan_policy in ['pairwise', 'listwise']
    if all([v is not None for v in [items, scores, subject]]):
        # Data in long-format: we first convert to a wide format.
        data = data.pivot(index=subject, values=scores, columns=items)
    # From now on we assume that data is in wide format.
    n, k = data.shape
    assert k >= 2, 'At least two items are required.'
    # BUG FIX: corrected the typo "At east" in the error message.
    assert n >= 2, 'At least two raters/subjects are required.'
    err = 'All columns must be numeric.'
    assert all([data[c].dtype.kind in 'bfi' for c in data.columns]), err
    if data.isna().any().any() and nan_policy == 'listwise':
        # In R: psych::alpha(data, use="complete.obs")
        data = data.dropna(axis=0, how='any')
    # Compute covariance matrix and Cronbach's alpha.
    C = data.cov()
    cronbach = (k / (k - 1)) * (1 - np.trace(C) / C.sum().sum())
    # which is equivalent to
    # v = np.diag(C).mean()
    # c = C.values[np.tril_indices_from(C, k=-1)].mean()
    # cronbach = (k * c) / (v + (k - 1) * c)
    # Confidence intervals from the F distribution (Feldt method).
    alpha = 1 - ci
    df1 = n - 1
    df2 = df1 * (k - 1)
    lower = 1 - (1 - cronbach) * f.isf(alpha / 2, df1, df2)
    upper = 1 - (1 - cronbach) * f.isf(1 - alpha / 2, df1, df2)
    return round(cronbach, 6), np.round([lower, upper], 3)
def ROC_CI(N, Vec_theta, alpha=0.05):
    """
    One-Dimensional Confidence-Interval Calculations

    Pointwise confidence bounds for a vector of estimated proportions
    (e.g. points of an empirical ROC curve), using a different
    approximation per element:

    * N > 100 and theta > 0.1 : t-based normal (Wald) interval,
    * N > 100 and theta < 0.1 : chi-square (rare-event) bounds,
    * otherwise               : exact bounds expressed via F quantiles
                                (Clopper-Pearson form).

    Parameters
    ----------
    N : int
        Number of trials behind each estimated proportion.
    Vec_theta : numpy.ndarray
        Vector of estimated proportions in [0, 1].
    alpha : float
        Two-sided significance level (used one-sided when theta == 0).

    Returns
    -------
    theta_L : numpy.ndarray
        Lower bounds, forced to be non-decreasing across the vector.
    theta_U : numpy.ndarray
        Upper bounds, forced to be non-decreasing across the vector.
    """
    theta_L = np.zeros(Vec_theta.size)
    theta_U = np.zeros(Vec_theta.size)
    for i, theta in enumerate(Vec_theta):
        # theta == 0 has a degenerate lower bound, so the whole alpha mass
        # goes into the upper tail (one-sided interval).
        if theta != 0:
            alpha_2 = alpha / 2
        else:
            alpha_2 = alpha
        if N > 100 and theta > 0.1:
            # Large sample, non-extreme proportion: t-based Wald interval.
            d = N - 1
            sigma = sqrt(theta * (1 - theta))
            if theta == 0:
                # Unreachable in this branch (theta > 0.1); kept as written.
                theta_L[i] = 0
            else:
                theta_L[i] = theta - t.isf(alpha_2, df=d) * sigma / sqrt(N)
            theta_U[i] = theta + t.isf(alpha_2, df=d) * sigma / sqrt(N)
        elif N > 100 and theta < 0.1:
            # Large sample, rare event: chi-square bounds.
            if theta == 0:
                theta_L[i] = 0
            else:
                d_L = 2 * N * theta
                theta_L[i] = chi2.isf(1 - alpha_2, df=d_L) / (2 * N)
            d_U = 2 * (N * theta + 1)
            theta_U[i] = chi2.isf(alpha_2, df=d_U) / (2 * N)
        else:
            # Small sample (note: theta == 0.1 with N > 100 also lands
            # here): exact interval expressed through F quantiles.
            d1L = N - N * theta + 1
            d2L = N * theta
            if theta == 0:
                theta_L[i] = 0
            else:
                theta_L[i] = d2L / (d2L + d1L * f.isf(alpha_2, 2 * d1L, 2 * d2L))
            d1U = N * theta + 1
            d2U = N - N * theta
            theta_U[i] = d1U * f.isf(alpha_2, 2 * d1U, 2 * d2U) / (
                d2U + d1U * f.isf(alpha_2, 2 * d1U, 2 * d2U))
    # ensure increase: make both bound vectors non-decreasing
    # (Vec_theta is presumably sorted ascending -- TODO confirm callers).
    for i in range(Vec_theta.size - 1):
        if theta_L[i + 1] < theta_L[i]:
            theta_L[i + 1] = theta_L[i]
        if theta_U[i + 1] < theta_U[i]:
            theta_U[i + 1] = theta_U[i]
    return theta_L, theta_U
def ftest(fvalue, df1, df2, p=0.05):
    """Two-sided F test.

    Returns 1 when ``fvalue`` lies inside the acceptance region
    [F_{1-p/2}, F_{p/2}] of the F(df1, df2) distribution, else 0.
    """
    lower_crit = f.isf(1 - p / 2, dfn=df1, dfd=df2)
    upper_crit = f.isf(p / 2, dfn=df1, dfd=df2)
    return 1 if lower_crit <= fvalue <= upper_crit else 0
def Cochran():
    """Cochran homogeneity test for the row variances (prints a report).

    Reads module-level state: disp(), ylist, ydisp, q, f1, f2, m, p --
    TODO confirm against the calling script. The critical value Gkr is
    derived from an F quantile instead of a lookup table.
    """
    # Per-row variances; disp() presumably also fills ydisp -- verify.
    d1 = disp(1, ylist[0])
    d2 = disp(2, ylist[1])
    d3 = disp(3, ylist[2])
    d4 = disp(4, ylist[3])
    # Experimental Cochran statistic: largest variance over the total.
    groz = round(max(ydisp) / sum(ydisp), 2)
    # Critical value via the F distribution.
    partresult = q / (f2 - 1)
    params = [partresult, f1, (f2 - 2) * f1]
    fisher = f.isf(*params)
    result = fisher / (fisher + (f2 - 2))
    gkr = round(Decimal(result).quantize(Decimal('.0001')).__float__(), 2)
    print("\n2)Критерій Кохрана:\n\n  Знайдемо дисперсії по рядках:")
    print("   ", d1, "\n   ", d2, "\n   ", d3, "\n   ", d4, "\n")
    print("  Dmax{{yi}} = {0}\n  Gp = {0}/({1}+{2}+{3}+{4}) = {5}".format(
        max(ydisp), *ydisp, groz))
    print("  f1 = {0} - 1 = {1}, f2 = 4, q = {3}\n  За таблицею Gкр = {2}".
          format(m, f1, gkr, q))
    if groz < gkr:
        print(
            "  Gp < Gкр => За критерієм Кохрана дисперсія однорідна з ймовірністю",
            p)
    else:
        print(
            "  Gp > Gкр => За критерієм Кохрана дисперсія неоднорідна з ймовірністю",
            p)
def Fisher():
    """Fisher adequacy test for the regression model (prints a report).

    Reads module-level state: m, y1..y4, ymed, disp, N, d, q, f3, p --
    TODO confirm against the calling script.
    """
    # Adequacy variance from the deviations of predicted vs mean responses.
    sad = round(
        m * ((y1 - ymed[0])**2 + (y2 - ymed[1])**2 + (y3 - ymed[2])**2 +
             (y4 - ymed[3])**2) / (4 - d), 2)
    # Experimental Fisher statistic.
    froz = round(sad / disp, 2)
    f4 = N - d
    # Tabulated critical value from the F quantile.
    fkr = Decimal(abs(f.isf(q, f4, f3))).quantize(Decimal('.0001')).__float__()
    print("\n3)Критерій Фішера:\n")
    print("  f4 = {2} - {0} = {1}".format(d, f4, N))
    print(
        "  {0}*(({5} - {1})**2 + ({6} - {2})**2 + ({7} - {2})**2 + ({8} - {2})**2)/(4-{10}) = {9}"
        .format(m, *ymed, y1, y2, y3, y4, sad, d))
    print("  Fр = {0}/{1} = {2}".format(sad, disp, froz))
    print("  За таблицею Fкр =", fkr)
    if fkr > froz:
        print(
            "  За критерієм Фішера рівняння регресії адекватне оригіналу з ймовірністю",
            p)
    else:
        print(
            "  За критерієм Фішера рівняння регресії неадекватне оригіналу з ймовірністю",
            p)
def Ftest(self, alpha=0.05):
    """Global F test for the regression: F = [ESS/(k-1)] / [RSS/(N-k)].

    ESS (explained sum of squares): regression sum of squares.
    RSS (residual sum of squares): sum of squared residuals.
    k: number of regression coefficients (number of predictors + 1).
    N: number of training samples.

    Returns True when the model passes the F test at level ``alpha``.
    """
    # NOTE(review): the attribute names are swapped relative to convention --
    # self.y_hat holds the MEAN of y and self.y_bar holds the predictions.
    # Kept as-is because other code may read these attributes.
    self.y_hat = np.mean(self.y)
    print("输入是", self.x)
    self.y_bar = self.predict(self.x)
    # Vectorized sums of squares (replaces per-element list comprehensions).
    ESS = np.sum((np.asarray(self.y_bar) - self.y_hat) ** 2)
    RSS = np.sum((np.asarray(self.y_bar) - np.asarray(self.y)) ** 2)
    print("f分布的自由度是", self.k - 1, self.N - self.k)
    fValue = (ESS / (self.k - 1)) / (RSS / (self.N - self.k))
    # Requires a multiple linear regression with more samples than parameters.
    f_alpha = ff.isf(alpha, self.k - 1, self.N - self.k)
    print(fValue, f_alpha)
    if fValue > f_alpha:
        print("全部参数全为0的情况下,出现f统计量取值为", fValue, '的概率小于等于', alpha, "说明全部参数不为0")
        print("模型通过了f检验")
        return True
    else:
        return False
def cochrane(selectionSize, qty_of_selections, significance):
    """Critical value of Cochran's C statistic, derived from an F quantile
    and rounded to four decimal places."""
    size = selectionSize + 1
    tail_prob = significance / (size - 1)
    denom_dof = (size - 2) * qty_of_selections
    fisher = f.isf(tail_prob, qty_of_selections, denom_dof)
    g_crit = fisher / (fisher + (size - 2))
    return Decimal(g_crit).quantize(Decimal('.0001')).__float__()
def cv_mse_F(self, Y_predict_all, y, alpha, num_tr):
    """Choose the number of latent components by cross-validated PRESS and
    an F test against the minimum-PRESS model.

    Parameters: Y_predict_all -- cross-validated predictions, one column per
    candidate component count; y -- reference values; alpha -- F-test
    significance level; num_tr -- number of training samples (used for both
    F degrees of freedom). Returns (RMSECV_array, min_RMSECV, min_comp_best).
    """
    # Squared CV residuals and PRESS per candidate component count.
    press = np.square(np.subtract(Y_predict_all, y))
    PRESS_all = np.sum(press, axis=0)
    RMSECV_array = np.sqrt(PRESS_all / self.n)
    min_RMSECV = min(RMSECV_array)
    comp_array = RMSECV_array.argsort()
    # 1-based index of the global RMSECV minimum.
    comp_best = comp_array[0] + 1
    # PRESS up to and including the minimizer; F ratios against the best.
    k_press = PRESS_all[:comp_best]
    min_press = PRESS_all[comp_best - 1]
    F_h = k_press / min_press
    F_value = f.isf(alpha, num_tr, num_tr)
    F_bias = np.subtract(F_h, F_value)
    # Smallest component count whose PRESS is not significantly worse than
    # the minimum (prefer the more parsimonious model).
    min_comp = [k for k in range(len(F_bias)) if F_bias[k] < 0]
    min_comp_best = min_comp[0]
    if (min_comp_best == 0):
        # Guard: never report zero components.
        min_comp_best = 1
    min_RMSECV = RMSECV_array[min_comp_best - 1]
    return RMSECV_array, min_RMSECV, min_comp_best
def homoskedacity_test(self):
    """F test for homoskedasticity: compare residual variances of the first
    and second halves of the sample; return False when heteroskedasticity
    is detected (also writes a LaTeX fragment via self.ltw)."""
    outcome = True
    self.ltw('\\subsubsection{Homoskedastyczność}\n')
    first_half = self.n // 2
    second_half = self.n - first_half
    dof1 = first_half - (self.k + 1)
    dof2 = second_half - (self.k + 1)
    critical = f.isf(q=self.alpha, dfn=dof1, dfd=dof2)
    head = self.residuals[:first_half]
    tail = self.residuals[first_half:]
    self.log(self.residuals, head, tail)
    var_head = np.dot(head, head) / dof1
    var_tail = np.dot(tail, tail) / dof2
    statistic = var_head / var_tail
    self.log(statistic, critical)
    self.ltw(f'\\[F = {statistic}\\]\n')
    self.ltw(f'\\[F_{{{self.alpha}, {dof1}, {dof2}}} = {critical}\\]\n')
    if critical < abs(statistic):
        outcome = False
        self.log("Wystepuje heteroskedastycznosc")
        self.ltw('W modelu występuje heteroskedastyczność.\n')
    else:
        self.ltw('Model jest homoskedastyczny.\n')
    return outcome
def get_cochran_value(f1, f2, q):
    """Table (critical) value of Cochran's G criterion for f1/f2 degrees of
    freedom at significance q, via the F distribution."""
    upper_tail = q / f2
    fcrit = f.isf(upper_tail, f1, (f2 - 1) * f1)
    g_value = fcrit / (fcrit + f2 - 1)
    return Decimal(g_value).quantize(Decimal('.0001')).__float__()
# NOTE(review): this span defines get_fisher_value and then continues with
# statements that end in a bare `return`, so it is almost certainly the tail
# of an enclosing adequacy-check function whose header is outside this view
# -- confirm indentation against the original file.
def get_fisher_value(f3, f4, q):
    # Tabulated Fisher criterion value, rounded to 4 decimal places.
    return Decimal(abs(f.isf(q, f4, f3))).quantize(Decimal('.0001')).__float__()

f3 = (m - 1) * N  # dof of the reproducibility variance
f4 = N - d        # dof of the adequacy variance
q = 0.05          # significance level
# Model predictions for every factor combination vs measured means.
theoretical_y = numpy.array([regression_equation(row[0], row[1], row[2], b_coefficients) for row in x_table])
average_y = numpy.array(list(map(lambda el: numpy.average(el), y_table)))
# Adequacy variance and reproducibility variance.
s_ad = m / (N - d) * sum((theoretical_y - average_y) ** 2)
y_variations = numpy.array(list(map(numpy.var, y_table)))
s_v = numpy.average(y_variations)
# Experimental and tabulated Fisher statistics.
f_p = float(s_ad / s_v)
f_t = get_fisher_value(f3, f4, q)
theoretical_values_to_print = list(
    zip(map(lambda x: "x1 = {0[1]:<10} x2 = {0[2]:<10} x3 = {0[3]:<10}".format(x), x_table), theoretical_y))
print("\nПеревірка адекватності моделі за критерієм Фішера: m = {}, N = {} для таблиці y_table".format(m, N))
print("Теоретичні значення y для різних комбінацій факторів:")
print("\n".join(["{arr[0]}: y = {arr[1]}".format(arr=el) for el in theoretical_values_to_print]))
print("Fp = {}, Ft = {}".format(f_p, f_t))
print("Fp < Ft => модель адекватна" if f_p < f_t else "Fp > Ft => модель неадекватна")
# Model is adequate when the experimental value is below the tabulated one.
return True if f_p < f_t else False
def cochran(self, N, m):
    """Cochran homogeneity test on the per-row variances; when the test
    fails, m is incremented and the experiment is re-run recursively via
    Task.equation / Task.cochran.

    NOTE(review): reformatted from a collapsed source line -- the statement
    grouping inside the else-branch is the most plausible reading; confirm
    against the original file.
    """
    # Variance of the m repeated measurements in each of the N rows.
    for i in range(N):
        ydisp = 0
        for k in range(m):
            ydisp += (self.ylist[i][k] - self.ymed[i])**2
        ydisp /= m
        self.ydisplist.append(round(ydisp, self.round))
    # Experimental Cochran statistic: max variance over the total.
    self.groz = round(
        max(self.ydisplist) / sum(self.ydisplist), self.round)
    f1 = m - 1
    f2 = N
    # Critical value derived from an F quantile.
    partresult = self.q / f2
    params = [partresult, f1, (f2 - 1) * f1]
    fisher = f.isf(*params)
    result = fisher / (fisher + (f2 - 1))
    self.gkr = round(
        Decimal(result).quantize(Decimal('.0001')).__float__(), self.round)
    self.f1 = f1
    self.f2 = f2
    Task.printcoch(N)
    if self.groz < self.gkr:
        print(
            " Gp < Gкр => За критерієм Кохрана дисперсія однорідна з ймовірністю",
            p)
    else:
        print(
            " Gp > Gкр => За критерієм Кохрана дисперсія неоднорідна з ймовірністю",
            p)
        print(" Збільшуємо m на 1: m = {1}+1 = {0}".format(m + 1, m))
        m += 1
        Task.equation(N, m)
        Task.cochran(N, m)
def cohren_value(size_of_selections, qty_of_selections, significance):
    """Cochran criterion critical value derived from the F distribution,
    rounded to four decimal places."""
    m_eff = size_of_selections + 1
    alpha_i = significance / (m_eff - 1)
    f_quant = f.isf(alpha_i, qty_of_selections, (m_eff - 2) * qty_of_selections)
    g_crit = f_quant / (f_quant + (m_eff - 2))
    return Decimal(g_crit).quantize(Decimal('.0001')).__float__()
def FTest(self,alpha):
    """Overall F test of the fitted regression.

    Returns (Fvalue, Falpha, Fvalue > Falpha); the last element is True
    when the regression is significant at level alpha.
    """
    # F test (original comment: F-检验)
    yHat=self.predict(self.X)
    # Residual sum of squares.
    Qe=((self.Y-yHat)**2).sum(axis=0)
    yAver=np.mean(self.Y,axis=0)
    # Regression (explained) sum of squares.
    U=((yHat-yAver)**2).sum(axis=0)
    # F statistic with k_x and n_x - k_x - 1 degrees of freedom.
    Fvalue=(U/self.k_x)/(Qe/(self.n_x-self.k_x-1))
    # NOTE(review): the critical value uses dfn=1, not self.k_x -- correct
    # only for simple regression (k_x == 1); confirm intent for k_x > 1.
    Falpha=f.isf(alpha,1,self.n_x-self.k_x-1)
    return Fvalue,Falpha,Fvalue>Falpha
def cohren(disper):
    """Cochran homogeneity check: Gp = max variance / total variance against
    the critical Gt; reads module globals m, N, q and publishes Gp, Gt, f1, f2.

    Returns True when the variances are homogeneous (Gp < Gt).
    """
    global Gp, Gt, f1, f2
    Gp = max(disper) / sum(disper)
    f1 = m - 1
    f2 = N
    f_table = f.isf(q / f2, f1, (f2 - 1) * f1)
    Gt = round(f_table / (f_table + f2 - 1), 4)
    return Gp < Gt
def on_pushButton_f_quantile_clicked(self):
    """Qt slot: read the degrees of freedom and significance level from the
    spin boxes, compute the upper-tail F quantile and display it in the
    line edit (3 decimal places)."""
    f_n = self.doubleSpinBox_f_n.value()
    f_m = self.doubleSpinBox_f_m.value()
    arfa = self.doubleSpinBox_f_arfa.value()
    # isf = inverse survival function, i.e. the upper-tail quantile
    # F_{arfa}(f_m, f_n).
    quantile = f.isf(arfa, f_m, f_n)
    self.lineEdit_f_quantile.setText('%.3f' % quantile)
def region_of_rejection(SA, SE, s, n, alpha):
    """One-way ANOVA rejection check.

    SA / SE are the between-group and error sums of squares, s the number
    of groups, n the total sample size. Returns (test_value, F_value,
    reject_ind) where reject_ind is True when H0 is rejected at level alpha.
    """
    between_ms = SA / (s - 1)
    within_ms = SE / (n - s)
    test_value = between_ms / within_ms
    F_value = f.isf(alpha, s - 1, n - s)
    # bool() keeps the return type a plain Python bool, as in the original.
    reject_ind = bool(test_value >= F_value)
    return test_value, F_value, reject_ind
def Ftest(self, alpha):
    """Significance F test for simple linear regression (1 and n-2 dof).

    Returns [F statistic, critical value, significant?].
    """
    fitted = self.predict(self.X)
    residual_ss = np.sum((self.Y - fitted) ** 2)
    regression_ss = np.sum((fitted - self.Y.mean()) ** 2)
    n_obs = len(self.X)
    f_stat = regression_ss / (residual_ss / (n_obs - 2))
    f_crit = f.isf(alpha, 1, n_obs - 2)
    return [f_stat, f_crit, f_stat > f_crit]
def kohren_check(disper):
    """Cochran homogeneity check: prints the experimental Gp and the
    critical Gt; reads module globals m, N, q and publishes Gp, Gt, f1, f2."""
    global Gp, Gt, f1, f2
    print("Критерій Кохрена")
    Gp = max(disper) / sum(disper)
    f1 = m - 1
    f2 = N
    f_table = f.isf(q / f2, f1, (f2 - 1) * f1)
    Gt = round(f_table / (f_table + f2 - 1), 4)
    print(f"Gp = {Gp}, Gt = {Gt}")
def cohrenValue(selectionSize, selectionQty, significance):
    """Return the tabulated Cochran criterion value, rounded to 4 decimals."""
    size = selectionSize + 1
    p_tail = significance / (size - 1)
    fq = f.isf(p_tail, selectionQty, (size - 2) * selectionQty)
    return Decimal(fq / (fq + (size - 2))).quantize(Decimal('.0001')).__float__()
def get_cohren_value(self):
    """Table value of Cochran's criterion for self.N experiment rows,
    self.m repeats and significance level self.q."""
    n_rows = self.N                  # (self.N + 1) - 1 in the original
    reps_dof = self.m - 1
    tail_prob = self.q / n_rows
    fcrit = f.isf(tail_prob, reps_dof, (n_rows - 1) * reps_dof)
    g_value = fcrit / (fcrit + n_rows - 1)
    return Decimal(g_value).quantize(Decimal('.0001')).__float__()
def get_cohren_value(size_of_selections, qty_of_selections, significance):
    """Cochran criterion critical value; imports lazily so the function is
    self-contained (as in the original)."""
    from _pydecimal import Decimal
    from scipy.stats import f
    rows = size_of_selections + 1
    tail_prob = significance / (rows - 1)
    fcrit = f.isf(tail_prob, qty_of_selections, (rows - 2) * qty_of_selections)
    g_value = fcrit / (fcrit + (rows - 2))
    return Decimal(g_value).quantize(Decimal('.0001')).__float__()
def dModXPlot(self):
    """Bar plot of DModX (distance to the model in X-space) per observation
    with a horizontal critical-distance line from the F distribution.

    Returns the matplotlib figure.
    """
    dmx = self.dModX()
    nr, nc = self.x_mat.shape
    ncomp = self.scores.shape[1]
    # A0 = 1 when x_mean is an array (data were centered), 0 when it is the
    # integer placeholder -- presumably set in the fitting code; verify.
    A0 = 0 if type(self.x_mean) == int else 1
    # Critical DModX at the 5% level.
    fc = np.sqrt(f.isf(0.05, nr - ncomp - A0, nc - ncomp))
    ax = pd.Series(dmx, index=self.x_df.index.get_level_values(0)).plot(kind='bar', color='green')
    # NOTE(review): the critical line spans x = -1..20 regardless of the
    # number of bars -- confirm this matches the expected plot width.
    ax.hlines(fc, -1, 20)
    return ax.figure
def get_cohren_critical(self) -> float:
    '''
    Get table value of Cohren criterion

    Derived from an upper-tail F quantile using confidence level self.p and
    degrees of freedom self.f1 / self.f2.

    Returns:
        float -- [criterion value]
    '''
    tail = (1 - self.p) / self.f2
    fisher = f.isf(tail, self.f1, (self.f2 - 1) * self.f1)
    return fisher / (fisher + self.f2 - 1)
def fisher_check():
    """Fisher adequacy check: compute the experimental Fp and the table
    value Ft; reads module globals N, d, m, f_x, y_mean, sb, q, f3 and
    publishes Fp, Ft."""
    global Fp, Ft
    f4 = N - d
    # Adequacy variance from squared prediction-vs-mean deviations.
    adequacy = sum((f_x[i] - y_mean[i])**2 for i in range(N)) * (m / (N - d))
    Fp = adequacy / sb
    print(f"\n\nFp = {Fp:}", end="\t\t\t")
    Ft = round(abs(f.isf(q, f4, f3)), 4)
    print(f"Ft = {Ft:}")
def on_pushButton_f_quantile_plot_clicked(self):
    """Qt slot: recompute the upper-tail F quantile from the spin boxes and,
    when the one-sided option is selected, redraw the F density with the
    rejection region filled."""
    f_n = self.doubleSpinBox_f_n.value()
    f_m = self.doubleSpinBox_f_m.value()
    arfa = self.doubleSpinBox_f_arfa.value()
    # Upper-tail quantile F_{arfa}(f_m, f_n).
    quantile = f.isf(arfa, f_m, f_n)
    if self.radioButton_one.isChecked():
        # Clear the axes and redraw the density plus the filled tail.
        self.widget.mpl.axes.cla()
        self.widget.mpl.start_f_plot(f_m, f_n)
        self.widget.mpl.fill_f_plot(f_m, f_n, quantile, self.a, self.b, arfa)
def fisher_check():
    """Fisher adequacy test (verbose variant): compares the adequacy
    variance to the reproducibility variance and prints Fp and the
    tabulated Ft.

    Reads module globals N, d, m, f_x, y_mean, sb, q, f3; sets Fp and Ft.
    """
    global Fp, Ft
    print("\nКритерій Фішера")
    f4 = N - d
    # Adequacy variance: m/(N-d) times the sum of squared deviations
    # between predictions (f_x) and measured means (y_mean).
    sad = 0
    for i in range(N):
        sad += (f_x[i] - y_mean[i])**2
    sad *= (m / (N - d))
    Fp = sad / sb
    print(f"Fp = {Fp:}")
    print(f"Кількість степенів свободи: F4 = N - d = {f4:}")
    # Tabulated critical value from the F quantile.
    Ft = round(abs(f.isf(q, f4, f3)), 4)
    print(f"Табличне значення коефіцієнту Фішера: Ft = {Ft:}")
def get_cohren_value(size_of_selections, qty_of_selections, significance):
    """Critical value of Cochran's criterion at the given significance,
    computed from an F quantile and rounded to four decimals."""
    rows = size_of_selections + 1
    tail_prob = significance / (rows - 1)
    denominator_dof = (rows - 2) * qty_of_selections
    fisher = f.isf(tail_prob, qty_of_selections, denominator_dof)
    cochran = fisher / (fisher + (rows - 2))
    return Decimal(cochran).quantize(Decimal('.0001')).__float__()
def kohren_check(disper):
    """Cochran homogeneity check (verbose variant): prints Gp, the degrees
    of freedom and the tabulated Gt.

    Reads module globals m, N, q; publishes Gp, Gt, f1, f2.
    """
    global Gp, Gt, f1, f2
    print("Критерій Кохрена")
    # Experimental statistic: largest variance over the total.
    Gp = max(disper) / sum(disper)
    f1 = m - 1
    f2 = N
    # Critical value derived from an F quantile instead of a lookup table.
    fisher = f.isf(*[q / f2, f1, (f2 - 1) * f1])
    Gt = round(fisher / (fisher + (f2 - 1)), 4)
    print(
        f"Gp = {Gp:}\nКількість степенів свободи: F1 = m - 1 = {f1:}; F2 = N = {f2:}"
    )
    print(
        f"Рівень значимості: q = 1 - p = {q:}\nТабличне значення коефіцієнту Кохрена: Gt = {Gt:}"
    )
def region_of_rejection(SA, SB, SAXB, SE, r, s, t, alpha):
    """Two-way ANOVA rejection-region computation.

    Parameters: SA, SB, SAXB, SE -- sums of squares for factor A, factor B,
    the A x B interaction and the error; r, s -- number of levels of A and
    B; t -- repeats per cell; alpha -- significance level.

    Returns a 9-tuple of (F statistic, critical value, rejection result)
    for factor A, factor B and the interaction; the interaction entries are
    None when t == 1, and everything is None for invalid t.
    """
    if t > 1:
        #
        # repeated test: error degrees of freedom = r*s*(t-1)
        #
        err_df = r * s * (t - 1)
        test_FA = (SA / (r - 1)) / (SE / err_df)
        F_A = f.isf(alpha, r - 1, err_df)
        test_FB = (SB / (s - 1)) / (SE / err_df)
        # BUG FIX: numerator dof for factor B is s - 1 (was r - 1, a
        # copy/paste slip; the t == 1 branch below already used s - 1).
        F_B = f.isf(alpha, s - 1, err_df)
        test_FAXB = (SAXB / ((r - 1) * (s - 1))) / (SE / err_df)
        F_AXB = f.isf(alpha, (r - 1) * (s - 1), err_df)
    elif t == 1:
        #
        # single test: the interaction is confounded with the error term
        #
        err_df = (r - 1) * (s - 1)
        test_FA = (SA / (r - 1)) / (SE / err_df)
        F_A = f.isf(alpha, r - 1, err_df)
        test_FB = (SB / (s - 1)) / (SE / err_df)
        F_B = f.isf(alpha, s - 1, err_df)
        test_FAXB = None
        F_AXB = None
    else:
        test_FA = None
        F_A = None
        test_FB = None
        F_B = None
        test_FAXB = None
        F_AXB = None
    return test_FA, F_A, determine_rejection(test_FA, F_A), \
        test_FB, F_B, determine_rejection(test_FB, F_B), \
        test_FAXB, F_AXB, determine_rejection(test_FAXB, F_AXB)
#Decision process component def z_cmp(calculated_z_score_proportion, criticalz_percentage_proportion): """ calculated_z_score_proportion: the z score calculated from mu and xbar in proportion criticalz_percentage_proportion: Given Critical Value proportion for acceptance criteria if calculated_z_score_proportion > criticalz_percentage_proportion then the chance of that happening is p < criticalz_percentage_proportion """ return abs(calculated_z_score_proportion) > scipy.stats.norm.cdf(scipy.stats.norm.ppf((100-criticalz_percentage_proportion)/100.)) ######## ANALYSIS OF VARIANCE - ANOVA ########### """ F = Between Group Variability/Within Group Variability """ f_critical = lambda alpha, dfb, dfw: round(f.isf(alpha, dfb, dfw), 4) #between group var #xbar_groups is an array of x means df_ssb = lambda xbar_groups: float(len(xbar_groups) - 1) ssb = lambda xg, xbar_groups, each_grp_sample_size: \ sum([ni * (xbar - xg)**2 for ni, xbar in izip(each_grp_sample_size, xbar_groups)])/df_ssb(xbar_groups) #within group var #x_samples is an array of lists - this is like df for pooled variance df_ssw = lambda x_samples: float(reduce(lambda x,y: x+y, imap(len, x_samples)) - len(x_samples)) #this is like pooled variance for more than 2 samples - generic ssw = lambda x_samples: sum([var(samples) * len(samples) for samples in x_samples])/df_ssw(x_samples) #F ratio stats
def dinucleotide_usage(self, seqs, genome_id):
    """Calculate dinucleotide (n3:n1) usage statistics for genome.

    Writes a per-gene TSV report (<genome_id>.di_usage.tsv) containing GC
    content, dinucleotide percentages, a Hotelling T-squared statistic,
    a deviancy call against an F-distribution threshold, and Manhattan
    distances relative to whole-genome usage.

    Parameters
    ----------
    seqs : dict[seq_id] -> seq
        Sequences indexed by sequence id.
    genome_id : str
        Unique id of genome used to create output file.
    """
    # Calculate dinucleotide usage for each gene and the genome as a whole.
    # The dinucleotide is the 3rd codon position followed by the 1st
    # position of the next codon (hence the step of 3 starting at index 2).
    gene_di_usage = defaultdict(lambda: defaultdict(int))
    gene_n1 = defaultdict(lambda: defaultdict(int))
    gene_n3 = defaultdict(lambda: defaultdict(int))
    genome_di_usage = defaultdict(int)
    genome_n1 = defaultdict(int)
    genome_n3 = defaultdict(int)
    gc = {}
    for gene_id, seq in seqs.items():
        gc[gene_id] = seq_tk.gc(seq)
        for i in xrange(2, len(seq) - 2, 3):
            dinucleotide = seq[i:i + 2].upper()
            if 'N' not in dinucleotide:
                gene_di_usage[gene_id][dinucleotide] += 1
                gene_n3[gene_id][dinucleotide[0]] += 1
                gene_n1[gene_id][dinucleotide[1]] += 1
                genome_di_usage[dinucleotide] += 1
                genome_n3[dinucleotide[0]] += 1
                genome_n1[dinucleotide[1]] += 1
    # calculate Manhattan distance for each gene
    manhattan_dist = self._manhattan(gene_di_usage, genome_di_usage)
    # identify deviant genes under Hotelling T-squared statistic
    t2_stats = self._hotelling_statistic(gene_di_usage, gene_n3, gene_n1,
                                         genome_di_usage, genome_n3, genome_n1)
    # correction from T-squared distribution to F-distribution approximation
    # http://en.wikipedia.org/wiki/Hotelling%27s_T-squared_distribution
    # (15 dimensions -- presumably 16 dinucleotide frequencies minus one
    # degree of freedom; TODO confirm against _hotelling_statistic)
    num_genes = len(gene_di_usage)
    f_dist_correction = float(num_genes - 15 + 1) / (15 * num_genes)
    deviant_threshold = f.isf(self.critical_value, 15, num_genes - 15 + 1) / f_dist_correction
    # report dinucleotide usage of each gene
    di_set_sorted = sorted(genome_di_usage.keys())
    gene_ids_sorted = sorted(t2_stats.items(), key=operator.itemgetter(1), reverse=True)
    output_file = os.path.join(self.output_dir, genome_id + '.di_usage.tsv')
    fout = open(output_file, 'w')
    fout.write('Gene Id\tGC\tLength (bp)\t# dinucleotides\tHotelling T-squared statistic\tDeviant\tManhattan distance\tDeviations from mean')
    for di in di_set_sorted:
        fout.write('\t' + di)
    fout.write('\n')
    # Whole-genome summary row.
    genome_gc = seq_tk.gc(''.join(seqs.values()))
    genome_sum_di = sum(genome_di_usage.values())
    fout.write('%s\t%.2f\t%d\t%d' % ('<complete genome>', genome_gc * 100.0, sum([len(x) for x in seqs.values()]), genome_sum_di))
    fout.write('\t%s\t%s\t%.1f\t%.1f' % ('na', 'na', 0, 0))
    for di in di_set_sorted:
        fout.write('\t%.2f' % (genome_di_usage.get(di, 0) * 100.0 / genome_sum_di))
    fout.write('\n')
    # Per-gene rows, most deviant (largest T-squared) first.
    for gene_id, t2_stat in gene_ids_sorted:
        dinucleotides = gene_di_usage[gene_id]
        sum_di = sum(dinucleotides.values())
        fout.write('%s\t%.2f\t%d\t%d' % (gene_id, gc[gene_id] * 100, len(seqs[gene_id]), sum_di))
        fout.write('\t%.2f\t%s' % (t2_stat, 'yes' if t2_stat > deviant_threshold else 'no'))
        fout.write('\t%.2f\t%.2f' % (manhattan_dist[gene_id][0], manhattan_dist[gene_id][1]))
        for di in di_set_sorted:
            fout.write('\t%.2f' % (dinucleotides.get(di, 0) * 100.0 / sum_di))
        fout.write('\n')
    fout.close()
# Upper-tail quantile sanity checks against textbook tables.
from scipy.stats import chi2, t, f
import numpy as np

# Q1: chi-square quantiles
q1_1 = chi2.isf(q=0.95, df=4)
q1_2 = chi2.isf(q=0.05, df=4)
q1_3 = chi2.isf(q=0.95, df=9)
q1_4 = chi2.isf(q=0.05, df=9)
assert np.allclose([q1_1, q1_2, q1_3, q1_4],
                   [0.710723, 9.48773, 3.32511, 16.9190])

# Q2: Student's t quantiles (table precision: 3 decimals)
q2_1 = t.isf(q=0.05, df=7)
q2_2 = t.isf(q=0.025, df=7)
q2_3 = t.isf(q=0.05, df=12)
q2_4 = t.isf(q=0.025, df=12)
assert np.allclose([q2_1, q2_2, q2_3, q2_4],
                   [1.895, 2.365, 1.782, 2.179], rtol=1.e-3)

# Q3: F quantiles; the lower-tail value is the inverse of F(7,5; 0.05)
q3_1 = f.isf(q=0.05, dfn=5, dfd=7)
q3_2 = f.isf(q=0.95, dfn=5, dfd=7)
assert np.allclose(q3_1, 3.9715)
assert np.allclose(q3_2, 0.2050903422957813)