def kruskal_test(S, D):
    """Run a per-protein Kruskal-Wallis test between two data frames.

    Every index label present in both ``S`` and ``D`` is tested, using the
    first row carrying that label in each frame as one sample. A label is
    skipped when its row is entirely NaN in either frame; remaining NaNs are
    left to SciPy (``nan_policy="omit"``).

    Parameters
    ----------
    S, D : pandas.DataFrame
        Indexed by protein; each row holds the observations for one protein.
        Presumably numeric, since ``np.isnan`` is applied to every row.

    Returns
    -------
    pandas.DataFrame
        Indexed by protein and sorted by ascending ``p``. Column ``p`` is the
        Kruskal-Wallis p-value; column ``q`` is whatever ``qvalues`` returns
        for the ``p`` column.
    """
    # Only the first row per label is ever used, so drop later duplicates once
    # instead of re-scanning the whole index with a boolean mask per protein.
    S_rows = S[~S.index.duplicated(keep="first")]
    D_rows = D[~D.index.duplicated(keep="first")]

    # Index.intersection yields the shared labels in a reproducible order,
    # unlike iterating over a Python set.
    shared = S_rows.index.intersection(D_rows.index)
    S_matrix = S_rows.loc[shared].to_numpy()
    D_matrix = D_rows.loc[shared].to_numpy()

    proteins = []
    p_values = []
    for protein, S_val, D_val in zip(shared, S_matrix, D_matrix):
        # A sample without a single observation cannot be tested.
        if np.isnan(S_val).all() or np.isnan(D_val).all():
            continue
        p_values.append(stats.kruskal(S_val, D_val, nan_policy="omit")[1])
        proteins.append(protein)

    result = pd.DataFrame(p_values, index=proteins, columns=["p"])
    result = result.sort_values(by="p")
    result["q"] = qvalues(result, pcolname="p")
    return result
def shapiro(df):
    """Apply ``stats.shapiro`` to every row of ``df`` and attach q-values.

    The per-row results are expanded into a DataFrame that keeps the index of
    ``df``; a ``q`` column is then added from ``qvalues`` using the ``pvalue``
    column.

    NOTE(review): the ``pvalue`` column only exists if the objects returned by
    ``stats.shapiro`` carry field names -- confirm for the SciPy in use.
    """
    per_row = df.apply(stats.shapiro, axis=1)
    result = pd.DataFrame(per_row.tolist(), index=df.index)
    result["q"] = qvalues(result, pcolname="pvalue")
    return result
def _first_row_values(frame, label):
    """Return the first row of *frame* labelled *label* as a 1-D array.

    The array is empty when the label is absent or the frame has no columns.
    """
    return frame[frame.index == label].iloc[:1].to_numpy().ravel()


def kruskal_treatment_groups(S, D, cell_line="all"):
    """Kruskal-Wallis test per protein across 18 treatment groups.

    The columns of ``S`` and of ``D`` are each split into nine groups
    (treatments 1 to 9) by a regular-expression search on the column name,
    giving 18 samples per protein: the nine ``S`` groups followed by the nine
    ``D`` groups.

    Parameters
    ----------
    S, D : pandas.DataFrame
        Indexed by protein. Column names are expected to contain
        ``"<treatment>_"`` followed by a cell-line tag (inferred from the
        patterns used here -- confirm against the data).
    cell_line : str, default "all"
        ``"all"`` selects every column containing ``"<treatment>_"``. Any
        other value is inserted into the pattern ``"<treatment>_<cell_line>"``
        as a regular-expression fragment.

    Returns
    -------
    pandas.DataFrame
        Indexed by protein and sorted by ascending ``p``. Column ``p`` is the
        Kruskal-Wallis p-value; column ``q`` is whatever ``qvalues`` returns
        for the ``p`` column. Proteins for which SciPy raises ``ValueError``
        are left out.
    """
    # DataFrame.filter(regex=...) performs an unanchored search, so no
    # trailing wildcard is needed. A glob-style "*" after the cell line would
    # only make its last character optional/repeatable, which lets unrelated
    # columns slip into the group.
    suffix = ".*" if cell_line == "all" else cell_line
    groups = [
        frame.filter(regex=f"{treatment}_{suffix}")
        for frame in (S, D)
        for treatment in range(1, 10)
    ]

    proteins = []
    p_values = []
    for protein in D.index:
        # Each sample is handed over as a flat 1-D row, the same way
        # kruskal_test does, rather than as a one-row (2-D) DataFrame.
        samples = [_first_row_values(group, protein) for group in groups]
        try:
            p_value = stats.kruskal(*samples, nan_policy="omit")[1]
        except ValueError:
            # Best effort: skip proteins whose samples SciPy refuses to test.
            continue
        proteins.append(protein)
        p_values.append(p_value)

    kruskal_df = pd.DataFrame(p_values, index=proteins, columns=["p"])
    kruskal_df.sort_values(by="p", inplace=True)
    kruskal_df["q"] = qvalues(kruskal_df, pcolname="p")
    return kruskal_df
# Exploratory normality / ANOVA checks on the S and D frames (notebook-style
# statements). In order, the statements below:
#   * draw probability plots against a normal distribution (stats.probplot,
#     dist="norm") of the stacked Sn and Dn values, plotting through pylab;
#   * take the first index label of S and run stats.f_oneway on that row of S
#     against the same-labelled row of D;
#   * run stats.f_oneway(S, D, axis=1), keep element [1] of the result as
#     column "p", sort by it, add a "q" column from qvalues(), and draw
#     histograms of the p and q columns.
# NOTE(review): the "p" frame is built without an index argument, so it
# presumably carries a default integer index rather than the labels of S --
# confirm that this is intended.
# NOTE(review): the line breaks of this section appear to have been lost. As
# written, everything after the "#" below (the S_t / D_t column filters and the
# start of a kruskal_test definition) is part of one comment, and that
# kruskal_test fragment stops partway through its body.
stats.probplot(Sn.stack().values, dist = "norm", plot=pylab) stats.probplot(Dn.stack().values, dist = "norm", plot=pylab) protein = S.index[0] S_comp = S[S.index == protein] D_comp = D[D.index == protein] anova = stats.f_oneway(S_comp.values[0] ,D_comp.values[0]) len(S) anova_p = stats.f_oneway(S,D,axis=1)[1] anova_p = pd.DataFrame(anova_p, columns = ["p"]) anova_p["p"].hist(bins=100) anova_p = anova_p.sort_values(by = "p") anova_p["q"] = qvalues(anova_p, pcolname = "p") anova_p["q"].hist(bins=1000) anova_p.dropna().q.hist(bins=200) protein # two treatments S_t = S.filter(regex="1_.*") D_t = D.filter(regex="1_.*") def kruskal_test(S,D): protein_array = [] kruskal = [] for protein in list(set(S.index) & set(D.index)): S_val = S[S.index == protein].values[0] if np.isnan(S_val).all():
def shapiro(df):
    """Apply ``stats.shapiro`` to every row of ``df`` and attach q-values.

    The per-row results are expanded into a DataFrame that keeps the index of
    ``df``; a ``q`` column is then added from ``qvalues`` using the ``pvalue``
    column.

    NOTE(review): the ``pvalue`` column only exists if the objects returned by
    ``stats.shapiro`` carry field names -- confirm for the SciPy in use.
    """
    # Use a distinct local name so the function does not shadow itself.
    per_row = df.apply(stats.shapiro, axis=1)
    result = pd.DataFrame(per_row.tolist(), index=df.index)
    result["q"] = qvalues(result, pcolname="pvalue")
    return result


# Shapiro p > 0.05 indicates normality
shapiro_S = shapiro(S)
shapiro_D = shapiro(D)
shapiro_S.q.min()
shapiro_D.q.min()

# Row-wise two-sample t-test between S and D; one row per label of S with
# columns "t", "p" and the derived "q".
ttest = pd.DataFrame(
    stats.ttest_ind(S, D, axis=1), columns=S.index, index=["t", "p"]
).T
ttest["q"] = qvalues(ttest, pcolname="p")
ttest["p"].hist()
plt.title("t-test, (RKO C vs T3)")

# Distribution of all stacked values in S.
fig, ax = plt.subplots()
S.stack().hist(bins=100)
plt.title("S all FC (RKO C vs T3)")

# Distribution of all stacked values in D, then one histogram per column.
fig, ax = plt.subplots()
D.stack().hist(bins=100)
plt.title("D all FC (RKO C vs T3)")
D.hist(bins=100)

# Overlaid histogram of SD.
fig, ax = plt.subplots()
SD.plot(kind="hist", bins=100, grid=True, alpha=0.7)
plt.title("All FC (MCF7 C vs T1)")