def sign(df):
    des, res = researchpy.ttest(df['diff'], df['poll'])
    p = res.loc[res['Independent t-test'] == 'Two side test p value = '].iat[0, 1]
    if p < 0.05:
        print('The p value obtained from the t-test is significant (p < 0.05), and therefore, '
              'we conclude that there is a difference between the two variables.')
    else:
        print('The p value obtained from the t-test is not significant, and therefore, '
              'we conclude that there is no difference between the two variables.')
    print(researchpy.ttest(df['diff'], df['poll']))
def tTest(data, checking, group, group1, group2, nameGroup1, nameGroup2, x, output):
    output[x]['Variable'] = checking
    leveneResult = stats.levene(data[checking][data[group] == group1],
                                data[checking][data[group] == group2],
                                center='mean')
    summary, results = rp.ttest(group1=data[checking][data[group] == group1],
                                group1_name=nameGroup1,
                                group2=data[checking][data[group] == group2],
                                group2_name=nameGroup2)
    output[x][nameGroup1 + ' N'] = round(summary.iloc[0]['N'], 2)
    output[x][nameGroup2 + ' N'] = round(summary.iloc[1]['N'], 2)
    output[x][nameGroup1 + ' Mean'] = round(summary.iloc[0]['Mean'], 2)
    output[x][nameGroup2 + ' Mean'] = round(summary.iloc[1]['Mean'], 2)
    output[x][nameGroup1 + ' SD'] = round(summary.iloc[0]['SD'], 2)
    output[x][nameGroup2 + ' SD'] = round(summary.iloc[1]['SD'], 2)
    output[x][nameGroup1 + ' SE'] = round(summary.iloc[0]['SE'], 2)
    output[x][nameGroup2 + ' SE'] = round(summary.iloc[1]['SE'], 2)
    if leveneResult.pvalue < 0.05:
        output[x]['Levene Value'] = str(round(leveneResult.pvalue, 2)) + "****"
    else:
        output[x]['Levene Value'] = str(round(leveneResult.pvalue, 2))
    values = results.results
    output[x]["T-Test P Value"] = signifiant(float(values.loc[[3]]))
    output[x]["Cohen Effect Size"] = effectSize(float(values.loc[[6]]))
def test_for_side_difference_one_var(df_side, lvl, var, hue='value', plot_dist=True):
    """Performs a paired t-test on each condition for every variable.
    lvl0: data = players' mean of all valid moves."""
    import seaborn as sns
    import researchpy as rp
    import pandas as pd
    import matplotlib.pyplot as plt
    df = get_data_group_by_player_mean(df_side, lvl, var, hue=hue)
    describe, results = rp.ttest(df['right'], df['left'], paired=True)
    if plot_dist:
        sns.distplot(df['right'], norm_hist=True, color='b', label='right - {}'.format(var))
        sns.distplot(df['left'], norm_hist=True, color='g', label='left - {}'.format(var))
        plt.legend(loc='best')
    return df, describe, results
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None,
                                metric=None, as_percentage=None, is_categorical=None, alternative=None):
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_ref, df_resp_ref = get_prop_resp("t12a")
    prop_values = metric(df_resp["min_offer"], df_prop["offer"])
    prop_value = metrics.get_mean(prop_values)
    prop_dss_values = metric(df_resp["min_offer"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)
    prop_values_ref = metric(df_resp_ref["min_offer"], df_prop_ref["offer_dss"])
    prop_value_ref = metrics.get_mean(prop_values_ref)
    # auto_dss_values = metric(df_resp["min_offer"], df_prop["ai_offer"])
    # auto_dss_value = metrics.get_mean(auto_dss_values)
    dof = 0
    diff = None
    if is_categorical:
        table, res = rp.crosstab(pd.Series(prop_values_ref), pd.Series(prop_dss_values), test='chi-square')
        s, p, r = res.results.values
        test_label = f"(pearson chi2)"
        test_label = f"chi2"
        print("Conclusion: ",
              generate_cat_stat_sentence(np.mean(resp_dss_values), np.std(resp_dss_values),
                                         np.mean(auto_dss_values), np.std(auto_dss_values),
                                         s, p, dof, diff=diff,
                                         label1=treatment + ".dss", label2="t20.dss"))
    else:
        table, res = rp.ttest(pd.Series(prop_values_ref), pd.Series(prop_dss_values), paired=False)
        s = res.results[2]
        if alternative == "greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        elif alternative in (None, 'two-sided'):
            p = res.results[3]
        r = res.results[9]
        diff = res.results[0]
        dof = res.results[1]
        print("Conclusion: ",
              generate_stat_sentence(np.mean(resp_dss_values), np.std(resp_dss_values),
                                     np.mean(auto_dss_values), np.std(auto_dss_values),
                                     s, p, dof, diff=diff,
                                     label1=treatment + ".dss", label2="t20.dss"))
        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"
    print("RESUME: ", res)
    if as_percentage:
        res = {
            "Proposer + DSS": f'{100 * prop_dss_value:.2f} %',
            "T10": f'{100 * prop_value_ref:.2f} %',
        }
    else:
        res = {
            "Proposer + DSS": f'{prop_dss_value:.2f}',
            "T10": f'{prop_value_ref:.2f}',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
def get_rel_responder_min_offer(treatment, con, dfs=None, use_percentage=None, use_labels=None):
    if SELECTION != "resp":
        return
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop[df_resp.columns] = df_resp
    _, df_resp_ref = get_prop_resp("t12a")
    resp_values = df_resp["min_offer_final"]
    resp_ref_values = df_resp["min_offer"]
    table, res = rp.ttest(pd.Series(resp_values), pd.Series(resp_ref_values), paired=True)
    diff = res.results[0]
    dof = res.results[1]
    s = res.results[2]
    p = res.results[3]
    r = res.results[9]
    print(res)
    print("Conclusion: ",
          generate_stat_sentence(np.mean(resp_ref_values), np.std(resp_ref_values),
                                 np.mean(resp_values), np.std(resp_values),
                                 s, p, dof, diff=diff,
                                 label1=treatment, label2=treatment + ".dss"))
    resp_stat = stats.ttest_rel(df_resp["min_offer"], df_resp["min_offer_final"])
    resp_stat_t00 = stats.ttest_ind(df_resp["min_offer_final"], df_resp_ref["min_offer"])
    resp_wc_stat = stats.wilcoxon(df_resp["min_offer"], df_resp["min_offer_final"])
    res = {
        "mean T12": metrics.get_mean(df_resp["min_offer"]),
        "mean T13": metrics.get_mean(df_resp["min_offer_final"]),
        # "rejection_ratio": rejection_ratio(df_prop)
    }
    test_label = f"(ttest independent) H0: equal"
    res = {k: (f"{v:.3f}" if pd.notnull(v) and v != int(v) else v) for k, v in res.items()}
    res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
def ttest(dataframe):
    male = dataframe[dataframe['Sex'] == 'Male']
    male.reset_index(inplace=True)
    male.rename(columns={'Value': 'ValueM'}, inplace=True)
    female = dataframe[dataframe['Sex'] == 'Female']
    female.reset_index(inplace=True)
    female.rename(columns={'Value': 'ValueF'}, inplace=True)
    descriptives, results = researchpy.ttest(male['ValueM'], female['ValueF'])
    print(descriptives)
    print(results)
def analyze_manipulation(self):
    """
    Performs all manipulation check analyses across batches (section Ra).
    1. prints manip_actual mean, std
    2. prints manip_chance mean, std
    3. prints plot with mean manip_chance bar + standard error, line for manip_actual
    4. prints paired t-test results for manip_actual and manip_chance
    Returns nothing.
    """
    # error checking
    if self.summary is None:
        print("You must run .summarize() before running this function")
    actual = self.summary['manip_actual']
    chance = self.summary['manip_chance']
    # 1. print manip_actual mean, std
    print("\n>>> manip_actual mean, standard deviation:")
    print(f"n: {actual.count()}, mean: {actual.mean()}, std: {actual.std()}")
    # 2. print manip_chance mean, std
    print("\n>>> manip_chance mean, standard deviation:")
    print(f"n: {chance.count()}, mean: {chance.mean()}, std: {chance.std()}")
    # 3. create barplot
    print("\n>>> barplot:")
    plt.figure(figsize=[7.13, 2])
    bar = plt.barh(np.arange(1), actual.mean(), align='center', edgecolor="black", height=0.5)
    plt.errorbar(actual.mean(), np.arange(1), xerr=actual.std(), ecolor="black")
    plt.title('Manipulation Check Accuracy')
    plt.yticks(np.arange(1), '')
    plt.ylabel('Actual Accuracy')
    plt.xlabel('Accuracy')
    plt.xlim(left=0, right=1.0)
    plt.axvline(x=chance.mean(), linewidth=4, color="red", label="Chance Accuracy")  # threshold line
    plt.legend()
    plt.tight_layout()
    plt.savefig("manip.pdf", bbox_inches="tight")
    plt.show()
    # 4. paired t-test
    print("\n>>> paired t-test between manip_actual and manip_chance:")
    # print(stats.ttest_rel(actual, chance))
    tt1 = researchpy.ttest(actual, chance, paired=True)[1]
    print(tt1)
def raw_pairttest_generate_output_df(mod_raw_data_df):
    effect_size_df_row_lookup = {
        "Cohen's d": 6,
        "Hedge's g": 7,
        "Glass's delta": 8
    }
    dictionaries_list = []
    for pair in global_vars.raw_pairttest_var_pairs:
        series_time1 = mod_raw_data_df[pair[0]][(mod_raw_data_df[pair[0]].notnull())
                                                & (mod_raw_data_df[pair[1]].notnull())]
        # this also seems to be done within the researchpy ttest func, but it's fine
        series_time2 = mod_raw_data_df[pair[1]][(mod_raw_data_df[pair[0]].notnull())
                                                & (mod_raw_data_df[pair[1]].notnull())]
        result = rp.ttest(series_time1, series_time2,
                          group1_name=pair[0], group2_name=pair[1],
                          equal_variances=True, paired=True)
        ttest_stats_df = result[1]
        current_dict = {}
        current_dict["Variable"] = "{var1} - {var2}".format(var1=pair[0], var2=pair[1])
        current_dict["Time1_N"] = series_time1.count()
        current_dict["Time1_Mean"] = np.mean(series_time1)
        current_dict["Time1_SD"] = np.std(series_time1)
        current_dict["Time2_N"] = series_time2.count()
        current_dict["Time2_Mean"] = np.mean(series_time2)
        current_dict["Time2_SD"] = np.std(series_time2)
        current_dict["Degrees of Freedom"] = ttest_stats_df.iloc[1, 1]
        current_dict["t"] = ttest_stats_df.iloc[2, 1]
        current_dict[global_vars.effect_size_choice] = (
            np.nan if global_vars.effect_size_choice == "None"
            else ttest_stats_df.iloc[effect_size_df_row_lookup[global_vars.effect_size_choice], 1])
        current_dict["pvalues"] = ttest_stats_df.iloc[3, 1]
        dictionaries_list.append(current_dict)
    output_df = pd.DataFrame(dictionaries_list)
    return output_df
def get_rel_responder_abs_df(treatment, con, dfs=None, use_percentage=None, use_labels=None):
    if SELECTION != "resp":
        return
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop[df_resp.columns] = df_resp
    df_prop_full, df_resp_ref = get_prop_resp("t12a")
    resp_values = metrics.get_data(metrics.get_rel_responder_abs_df(df_prop))
    resp_ref_values = metrics.get_data(metrics.get_rel_responder_abs_df(df_prop_full))
    table, res = rp.ttest(pd.Series(resp_values), pd.Series(resp_ref_values), paired=False)
    diff = res.results[0]
    dof = res.results[1]
    s = res.results[2]
    p = res.results[3]
    r = res.results[9]
    print("Conclusion: ",
          generate_stat_sentence(np.mean(resp_ref_values), np.std(resp_ref_values),
                                 np.mean(resp_values), np.std(resp_values),
                                 s, p, dof, diff=diff,
                                 label1="t12.dss", label2=treatment + ".dss"))
    print("Table:", table)
    print("Res:", res)
    res = {
        "rel. min_offer T12": metrics.get_mean(resp_ref_values),
        "rel. min_offer T13": metrics.get_mean(resp_values),
        # "rejection_ratio": rejection_ratio(df_prop)
    }
    test_label = f"(ttest independent) H0: equal"
    res = {k: (f"{v:.3f}" if pd.notnull(v) and v != int(v) else v) for k, v in res.items()}
    res["min_offer" + test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    print()
    return res
def exp_lc_test(self):
    lcs = self.data[(self.data['type'] == 'lc')]
    lcs.reset_index(inplace=True)
    exps = self.data[(self.data['type'] == 'exp')]
    exps.reset_index(inplace=True)
    p_values = {}
    for param in lcs.columns[4:]:
        # print(param)
        descriptives, results = rp.ttest(lcs[param].dropna(how='all'),
                                         exps[param].dropna(how='all'))
        # print(descriptives)
        # print(results)
        p_values[param] = [results.iloc[3, 1],
                           descriptives.loc[0, 'Mean'],
                           descriptives.loc[1, 'Mean']]
    statistics = pd.DataFrame(p_values, index=['p-value', 'LC', 'EXP'])
    statistics = statistics.reindex(self.column_order, axis=1)  # enforce the configured column order
    print(statistics)
    statistics.to_csv('statistics.csv')
# make predictions for the test dataset
y_pred = rf.predict(X_test)
# calculate accuracy score
print(accuracy_score(y_test, y_pred) * 100)

# create LIME explainer
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values,
                                                   feature_names=features,
                                                   class_names=class_names,
                                                   discretize_continuous=True)
# i is the record we explain
i = 0
exp = explainer.explain_instance(X_test.values[i], rf.predict_proba, num_features=8)
exp.show_in_notebook(show_table=True, show_all=True)

# separate positive and negative outcomes for the independent t-test
positive_outcome = data[data['Outcome'] == 1]
negative_outcome = data[data['Outcome'] == 0]

# rename the 'Glucose' field to clarify the test output
positive_outcome = positive_outcome.rename({'Glucose': 'Positive_Glucose'}, axis=1)
negative_outcome = negative_outcome.rename({'Glucose': 'Negative_Glucose'}, axis=1)

# run a t-test to determine the difference of mean glucose for positive/negative outcomes
descriptive_stats, test_results = rp.ttest(negative_outcome['Negative_Glucose'],
                                           positive_outcome['Positive_Glucose'])
print(descriptive_stats)
print(test_results)
# fisher
# resultDf = resultDf[resultDf['participantsType'] != "machine"]
# crosstab, res = researchpy.crosstab(resultDf['participantsType'], resultDf['firstIntentionConsistFinalGoal'], test='fisher')
# print(crosstab)
# print(res)

df['trialType'] = ['Critical Disruption' if trial == "special" else 'Random Disruptions'
                   for trial in df['noiseNumber']]
statDF = pd.DataFrame()
statDF['commitmentRatio'] = df.groupby(['name', 'trialType', 'participantsType'],
                                       sort=False)["firstIntentionConsistFinalGoal"].mean()
statDF['commitmentRatio'] = statDF.apply(lambda x: int(x["commitmentRatio"] * 100), axis=1)
statDF = statDF.reset_index()

# t-test
humanDF = statDF[(statDF.participantsType == "Humans") & (statDF.trialType == 'Critical Disruption')]
rLDF = statDF[(statDF.participantsType == "RL") & (statDF.trialType == 'Critical Disruption')]
des, res = researchpy.ttest(humanDF['commitmentRatio'], rLDF['commitmentRatio'])
print(des)
print(res)
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None,
                                metric=None, as_percentage=None, is_categorical=None, alternative=None):
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_t10, df_resp_t10 = get_prop_resp("t10a")
    metric_values = metric(df_prop)
    metric_value = metrics.get_mean(metric_values)
    metric_t10_values = metric(df_prop_t10)
    metric_value_t10 = metrics.get_mean(metric_t10_values)
    metric_values = metrics.get_data(metric_values)
    metric_t10_values = metrics.get_data(metric_t10_values)
    # print(stats.chisquare(metric_values[:103], metric_t10_values[:103]))
    dof = 0
    diff = None
    print(metric.__name__)
    if is_categorical:
        # table, res = rp.crosstab(pd.Series(metric_values), pd.Series(metric_t10_values), test='g-test')
        table, res = rp.crosstab(pd.Series(metric_values), pd.Series(metric_t10_values), test='fisher')
        # print(table, res)
        # s, p, r = res.results
        s = res.results[0]
        p = res.results[1]
        r = res.results[4]
        test_label = f"(g-test chi2)"
        print("Conclusion: ",
              generate_cat_stat_sentence(np.mean(metric_t10_values), np.std(metric_t10_values),
                                         np.mean(metric_values), np.std(metric_values),
                                         s, p, dof, diff=diff,
                                         label1="t10a.dss", label2=treatment + ".dss"))
        print(pd.crosstab(pd.Series(metric_t10_values), pd.Series(metric_values)))
    else:
        table, res = rp.ttest(pd.Series(metric_t10_values), pd.Series(metric_values), paired=False)
        s = res.results[2]
        if alternative == "greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        elif alternative in (None, 'two-sided'):
            p = res.results[3]
        r = res.results[9]
        diff = res.results[0]
        dof = res.results[1]
        print("Conclusion: ",
              generate_stat_sentence(np.mean(metric_t10_values), np.std(metric_t10_values),
                                     np.mean(metric_values), np.std(metric_values),
                                     s, p, dof, diff=diff,
                                     label1="t10a.dss", label2=treatment + ".dss"))
        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"
    print("TABLE: ", table)
    print("TEST: ", res)
    if as_percentage:
        res = {
            "Proposer + DSS": f'{100 * metric_value:.2f} %',
            "T10": f'{100 * metric_value_t10:.2f} %',
        }
    else:
        res = {
            "Proposer + DSS": f'{metric_value:.2f}',
            "T10": f'{metric_value_t10:.2f}',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None,
                                metric=None, as_percentage=None, is_categorical=None, alternative=None):
    df_prop, df_resp = get_prop_resp(treatment)
    prop_values = metric(df_resp["min_offer"], df_prop["offer"])
    prop_value = metrics.get_mean(prop_values)
    prop_dss_values = metric(df_resp["min_offer"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)
    auto_dss_values = metric(df_resp["min_offer"], df_prop["ai_offer"])
    auto_dss_value = metrics.get_mean(auto_dss_values)
    dof = 0
    diff = None
    print(metric)
    if is_categorical:
        table = pd.crosstab(prop_values, prop_dss_values)
        # print("TABLE: ", table)
        # checked using: http://vassarstats.net/propcorr.html
        # s, p = sms2.mcnemar(prop_values, prop_dss_values, exact=False, correction=False)
        table, res = rp.crosstab(prop_values, prop_dss_values, test='mcnemar')
        # chi, p, s = (res.results.values)
        s, p, r = (res.results.values)
        print("Conclusion: ",
              generate_stat_sentence(np.mean(prop_values), np.std(prop_values),
                                     np.mean(prop_dss_values), np.std(prop_dss_values),
                                     s, p, dof, diff=diff,
                                     label1=treatment, label2=treatment + ".dss"))
        test_label = f"(mcnemar - chi2)"
    else:
        s, p = stats.wilcoxon(prop_values, prop_dss_values, alternative=alternative or 'two-sided')
        table, res = rp.ttest(pd.Series(prop_values), pd.Series(prop_dss_values), paired=True)
        # res = rp.ttest(pd.Series(prop_values), pd.Series(prop_dss_values), paired=True)
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        p = res.results[3]
        r = res.results[9]
        test_label = f"(ttest dependent)"
        print("Conclusion: ",
              generate_stat_sentence(np.mean(prop_values), np.std(prop_values),
                                     np.mean(prop_dss_values), np.std(prop_dss_values),
                                     s, p, dof, diff=diff,
                                     label1=treatment, label2=treatment + ".dss"))
    print("TABLE:", table)
    print("RES:", res)
    if as_percentage:
        res = {
            "Proposer": f'{100 * prop_value:.2f} %',
            "Proposer + DSS": f'{100 * prop_dss_value:.2f} %',
            # "prop:dss - prop": f'{100 * (prop_dss_value - prop_value):.2f} %',
        }
    else:
        res = {
            "Proposer": f'{prop_value:.2f}',
            "Proposer + DSS": f'{prop_dss_value:.2f}',
            # "prop:dss - prop": f'{(prop_dss_value - prop_value):.2f} %',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
# collect data of the control group
control_group_original = post_df[post_df['QUESTNNR'] == 'group1']
control_group = control_group_original.dropna(axis=1, how='all')
control_group = control_group.drop(['CASE', 'QUESTNNR', 'STARTED'], axis=1).reset_index(drop=True)
control_group.columns = share.CONTROL_COLUMN_NAME
share.export_to_csv(control_group, 'control_group.csv')

# collect data of the experiment group
experiment_group_original = post_df[post_df['QUESTNNR'] == 'qnr2']
experiment_group = experiment_group_original.dropna(axis=1, how='all')
experiment_group = experiment_group.drop(['CASE', 'QUESTNNR', 'STARTED'], axis=1).reset_index(drop=True)
experiment_group.columns = share.EXPERIMENT_COLUMN_NAME
share.export_to_csv(experiment_group, 'experiment_group.csv')

# statistical analysis of the script data using t-tests (unequal variances)
script1_des, script1_res = researchpy.ttest(control_group['script1'].astype('int32'),
                                            experiment_group['script1'].astype('int32'),
                                            group1_name='control_script1', group2_name='exp_script1',
                                            equal_variances=False)
script2_des, script2_res = researchpy.ttest(control_group['script2'].astype('int32'),
                                            experiment_group['script2'].astype('int32'),
                                            group1_name='control_script2', group2_name='exp_script2',
                                            equal_variances=False)
script3_des, script3_res = researchpy.ttest(control_group['script3'].astype('int32'),
                                            experiment_group['script3'].astype('int32'),
                                            group1_name='control_script3', group2_name='exp_script3',
                                            equal_variances=False)
control_script = control_group.loc[:, ['script1', 'script2', 'script3']].astype('int32').mean(axis=1)
experiment_script = experiment_group.loc[:, ['script1', 'script2', 'script3']].astype('int32').mean(axis=1)
script_des, script_res = researchpy.ttest(control_script, experiment_script,
                                          group1_name='control', group2_name='exp',
                                          equal_variances=False)

# %% export analysis results
share.export_to_csv(script1_des, 'script1_des.csv')
share.export_to_csv(script1_res, 'script1_res.csv')
share.export_to_csv(script2_des, 'script2_des.csv')
share.export_to_csv(script2_res, 'script2_res.csv')
share.export_to_csv(script3_des, 'script3_des.csv')
share.export_to_csv(script3_res, 'script3_res.csv')
share.export_to_csv(script_des, 'script_des.csv')
share.export_to_csv(script_res, 'script_res.csv')
def analyze_viability(self, *diff_args: [int, int, str]):
    """
    Performs all raw viability score analyses across batches (section Rb split).
    1. prints refPair1[0] mean, std
    2. prints refPair1[1] mean, std
    3. prints refPair2[0] mean, std
    4. prints refPair2[1] mean, std
    5. prints bar plot of v2=R/B, v2=D/W means + std error
    6. prints box plot of v2=R/B, v2=D/W
    7. prints paired t-test results for initial scores R/B vs D/W
    8. prints paired t-test results for later scores R/B vs D/W
    Returns nothing.
    """
    # error checking
    if self.summary is None:
        print("You must run .summarize() before running this function")
    r1 = self.summary["initial_" + self.viability_labels[0]]
    r2 = self.summary["later_" + self.viability_labels[0]]
    d1 = self.summary["initial_" + self.viability_labels[1]]
    d2 = self.summary["later_" + self.viability_labels[1]]
    # 1. print r1 mean, std
    print(f"\n>>> initial_{self.viability_labels[0]} mean, standard deviation:")
    print(f"n: {r1.count()}, mean: {r1.mean()}, std: {r1.std()}")
    # 2. print r2 mean, std
    print(f"\n>>> later_{self.viability_labels[0]} mean, standard deviation:")
    print(f"n: {r2.count()}, mean: {r2.mean()}, std: {r2.std()}")
    # 3. print d1 mean, std
    print(f"\n>>> initial_{self.viability_labels[1]} mean, standard deviation:")
    print(f"n: {d1.count()}, mean: {d1.mean()}, std: {d1.std()}")
    # 4. print d2 mean, std
    print(f"\n>>> later_{self.viability_labels[1]} mean, standard deviation:")
    print(f"n: {d2.count()}, mean: {d2.mean()}, std: {d2.std()}")
    title = "Viability Scores of Rounds"
    labels = [textwrap.fill(text, 12) for text in
              ["Initial " + self.viability_labels[0], "Later " + self.viability_labels[0],
               "Initial " + self.viability_labels[1], "Later " + self.viability_labels[1]]]
    ylabel = "Team Mean Viability"
    order = [r1, r2, d1, d2]
    means = [x.mean() for x in order]
    stds = [x.std() for x in order]
    maxs = [x.max() for x in order]
    # 5. create barplot
    print("\n>>> barplot:")
    plt.figure(1)
    bar = plt.bar(np.arange(4), means, yerr=stds, align='center')
    plt.title(title)
    plt.xticks(np.arange(4), labels)
    plt.ylabel(ylabel)
    plt.savefig("raw_bar.pdf")
    plt.show()
    # 6. create boxplot
    print("\n>>> boxplot:")
    plt.figure(2)
    box = plt.boxplot(order, positions=np.arange(4))
    plt.title(title)
    plt.xticks(np.arange(4), labels)
    plt.ylabel(ylabel)
    plt.ylim(top=70, bottom=14)
    # label diffs
    for args in diff_args:
        label_diff(*args, maxs)
    plt.savefig("raw_box.pdf")
    plt.show()
    # 7. paired t-test initial
    print(f"\n>>> paired t-test between initial_{self.viability_labels[0]} and initial_{self.viability_labels[1]}:")
    tt1 = researchpy.ttest(r1, d1, paired=True)[1]
    print(tt1)
    # 8. paired t-test later
    print(f"\n>>> paired t-test between later_{self.viability_labels[0]} and later_{self.viability_labels[1]}:")
    tt1 = researchpy.ttest(r2, d2, paired=True)[1]
    print(tt1)
# The first distribution is far from normal, but the second one is normal.

# t-test
def t_test(x, y):
    if stats.ttest_ind(x, y)[1] < 0.05:
        return print('Greece is flattening the curve!')

t_test(df2['Total Confirm of new cases'], df1['Total Confirm of new cases'])

# The statistical test shows a difference between the two curves; the second one,
# which covers the period after the measures, is flatter.
descriptives, results = rp.ttest(df2['Total Confirm of new cases'],
                                 df1['Total Confirm of new cases'])
descriptives

# fitting the initial dataset
plt.figure(figsize=(11, 7))
plt.plot(xdata, ydata, 'ko', label='data')  # the original datapoints

# model1
popt, pcov = curve_fit(f=func, xdata=xdata, ydata=ydata, p0=None, sigma=None)
print(popt)  # parameters
print(pcov)  # covariance
import pandas
from scipy import stats
import researchpy

# %%
control_df = pandas.read_csv(share.CHAT_MSG_CONTROL)
experiment_df = pandas.read_csv(share.CHAT_MSG_EXP).drop(columns='marker')

# %%
T2_control_df = control_df.loc[control_df['chat_id'] == 'T2']['num_msg']
T2_experiment_df = experiment_df.loc[experiment_df['chat_id'] == 'T2']['num_msg']
T2_df = pandas.DataFrame({'con_T2': T2_control_df, 'exp_T2': T2_experiment_df})

# %%
T2_des, T2_res = researchpy.ttest(T2_df['exp_T2'], T2_df['con_T2'])

# %%
T3_control_df = control_df.loc[control_df['chat_id'] == 'T3']['num_msg']
T3_experiment_df = experiment_df.loc[experiment_df['chat_id'] == 'T3']['num_msg']
T3_df = pandas.DataFrame({'con_T3': T3_control_df, 'exp_T3': T3_experiment_df})

# %%
T3_des, T3_res = researchpy.ttest(T3_df['con_T3'], T3_df['exp_T3'])

# %%
control_group = control_df.groupby('user_name')
control_group = control_group.apply(lambda df: df.iloc[[1, 2]]['num_msg'].sum()).reset_index(drop=True)

# %%
def raw_indttest_generate_output_df(mod_raw_data_df):
    group1_df = mod_raw_data_df[mod_raw_data_df[global_vars.raw_indttest_groupvar]
                                == global_vars.raw_indttest_grouplevel1]
    group2_df = mod_raw_data_df[mod_raw_data_df[global_vars.raw_indttest_groupvar]
                                == global_vars.raw_indttest_grouplevel2]
    effect_size_df_row_lookup = {
        "Cohen's d": 6,
        "Hedge's g": 7,
        "Glass's delta": 8
    }
    dictionaries_list = []
    for var in global_vars.raw_indttest_dv:
        result = rp.ttest(
            group1_df[var], group2_df[var],
            group1_name=global_vars.raw_indttest_grouplevel1,
            group2_name=global_vars.raw_indttest_grouplevel2,
            equal_variances=stats.levene(
                group1_df[var].dropna().reset_index(drop=True),
                group2_df[var].dropna().reset_index(drop=True))[1] > 0.05,
            paired=False)
        ttest_stats_df1 = result[0]
        ttest_stats_df2 = result[1]
        current_dict = {}
        current_dict["Variable"] = var
        current_dict["All_N"] = int(ttest_stats_df1[ttest_stats_df1["Variable"] == "combined"].iloc[0]["N"])
        current_dict["All_Mean"] = ttest_stats_df1[ttest_stats_df1["Variable"] == "combined"].iloc[0]["Mean"]
        current_dict["All_SD"] = ttest_stats_df1[ttest_stats_df1["Variable"] == "combined"].iloc[0]["SD"]
        current_dict[global_vars.raw_indttest_grouplevel1 + "_N"] = int(
            ttest_stats_df1[ttest_stats_df1["Variable"] == global_vars.raw_indttest_grouplevel1].iloc[0]["N"])
        current_dict[global_vars.raw_indttest_grouplevel1 + "_Mean"] = ttest_stats_df1[
            ttest_stats_df1["Variable"] == global_vars.raw_indttest_grouplevel1].iloc[0]["Mean"]
        current_dict[global_vars.raw_indttest_grouplevel1 + "_SD"] = ttest_stats_df1[
            ttest_stats_df1["Variable"] == global_vars.raw_indttest_grouplevel1].iloc[0]["SD"]
        current_dict[global_vars.raw_indttest_grouplevel2 + "_N"] = int(
            ttest_stats_df1[ttest_stats_df1["Variable"] == global_vars.raw_indttest_grouplevel2].iloc[0]["N"])
        current_dict[global_vars.raw_indttest_grouplevel2 + "_Mean"] = ttest_stats_df1[
            ttest_stats_df1["Variable"] == global_vars.raw_indttest_grouplevel2].iloc[0]["Mean"]
        current_dict[global_vars.raw_indttest_grouplevel2 + "_SD"] = ttest_stats_df1[
            ttest_stats_df1["Variable"] == global_vars.raw_indttest_grouplevel2].iloc[0]["SD"]
        current_dict["Degrees_of_Freedom"] = ttest_stats_df2.iloc[1, 1]
        current_dict["t"] = ttest_stats_df2.iloc[2, 1]
        current_dict[global_vars.effect_size_choice] = (
            np.nan if global_vars.effect_size_choice == "None"
            else ttest_stats_df2.iloc[effect_size_df_row_lookup[global_vars.effect_size_choice], 1])
        current_dict["pvalues"] = ttest_stats_df2.iloc[3, 1]
        dictionaries_list.append(current_dict)
    output_df = pd.DataFrame(dictionaries_list)
    return output_df
def t_test_for_two_EX_with_unknown_but_equal_variance(self, p_group1, p_group2):
    # n = p_group1.count() * p_group2.count()
    # std_ = ((p_group1.std()**2 * (p_group1.count() - 1) + p_group2.std()**2 * (p_group2.count() - 1)) / (n - 2))**0.5
    # test_statitik = ((p_group1.mean() - p_group2.mean()) / std_) * (p_group1.count() * p_group2.count() / n)**0.5
    # t_quantil = scipy.stats.t.ppf(1 - alpha/2, n - 2)
    return rp.ttest(group1=p_group1, group1_name=p_group1.name,
                    group2=p_group2, group2_name=p_group2.name)
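# The commented-out lines above sketch the pooled (equal-variance) two-sample t statistic by hand.
# For reference, a minimal self-contained sketch of that statistic and its two-sided p value,
# using only numpy/scipy and made-up example data (the helper name and the data are hypothetical):
import numpy as np
import scipy.stats


def pooled_t_statistic(g1, g2):
    # pooled standard deviation with n1 + n2 - 2 degrees of freedom
    n1, n2 = len(g1), len(g2)
    sp = np.sqrt(((n1 - 1) * np.var(g1, ddof=1) + (n2 - 1) * np.var(g2, ddof=1)) / (n1 + n2 - 2))
    t = (np.mean(g1) - np.mean(g2)) / (sp * np.sqrt(1 / n1 + 1 / n2))
    return t, n1 + n2 - 2


g1 = np.array([5.1, 4.8, 6.0, 5.5, 5.9])
g2 = np.array([4.2, 4.9, 5.0, 4.4, 4.7])
t, dof = pooled_t_statistic(g1, g2)
p = 2 * scipy.stats.t.sf(abs(t), dof)  # two-sided p value from the t distribution
print(t, dof, p)
# cross-check against scipy's equal-variance implementation
print(scipy.stats.ttest_ind(g1, g2, equal_var=True))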
def analyze_viability_early(self, *diff_args: [int, int, str]):
    """
    Performs analyze_viability but with the early viability score (average of initial);
    really just a study 1 figure generator.
    1. prints refPair1[0] mean, std
    2. prints refPair1[1] mean, std
    3. prints median mean, std
    4. prints bar plot
    5. prints box plot
    6. prints paired t-test results for refPair1[0] and refPair1[1]
    7. prints paired t-test results for median and refPair1[1]
    Returns the plotted series [r1, r2, m] for exterior processing.
    """
    # error checking
    if self.summary is None:
        print("You must run .summarize() before running this function")
    r1 = self.summary["initial_" + self.viability_labels[0]]
    r2 = self.summary["later_" + self.viability_labels[0]]
    m = self.summary["median"]
    # 1. print r1 mean, std
    print(f"\n>>> initial_{self.viability_labels[0]} mean, standard deviation:")
    print(f"n: {r1.count()}, mean: {r1.mean()}, std: {r1.std()}")
    # 2. print r2 mean, std
    print(f"\n>>> later_{self.viability_labels[0]} mean, standard deviation:")
    print(f"n: {r2.count()}, mean: {r2.mean()}, std: {r2.std()}")
    # 3. print m mean, std
    print(f"\n>>> median round mean, standard deviation:")
    print(f"n: {m.count()}, mean: {m.mean()}, std: {m.std()}")
    title = "Viability Scores of Rounds"
    labels = [textwrap.fill(text, 12) for text in
              ["Best Initial Round", "Reconvened Round", "Median Round"]]
    ylabel = "Team Mean Viability"
    order = [r1, r2, m]
    means = [x.mean() for x in order]
    stds = [x.std() for x in order]
    maxs = [x.max() for x in order]
    # 4. create barplot
    print("\n>>> barplot:")
    plt.figure(1)
    bar = plt.bar(np.arange(3), means, yerr=stds, align='center')
    plt.title(title)
    plt.xticks(np.arange(3), labels)
    plt.ylabel(ylabel)
    plt.savefig("early_bar.pdf")
    plt.show()
    # 5. create boxplot
    print("\n>>> boxplot:")
    plt.figure(2)
    box = plt.boxplot(order, positions=np.arange(3))
    plt.title(title)
    plt.xticks(np.arange(3), labels)
    plt.ylabel(ylabel)
    plt.ylim(top=70, bottom=14)
    # label diffs
    for args in diff_args:
        label_diff(*args, maxs)
    plt.savefig("early_box.pdf")
    plt.show()
    # 6. paired t-test r1 and r2
    print(f"\n>>> paired t-test between initial and reconvened {self.viability_labels[0]}:")
    # print(stats.ttest_rel(r2, e))
    tt1 = researchpy.ttest(r1, r2, paired=True)[1]
    print(tt1)
    # 7. paired t-test median and r2
    print(f"\n>>> paired t-test between median and reconvened {self.viability_labels[0]}:")
    # print(stats.ttest_rel(r2, e))
    tt1 = researchpy.ttest(m, r2, paired=True)[1]
    print(tt1)
    # return data for exterior processing
    return order
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None,
                                metric=None, as_percentage=None, is_categorical=None, alternative=None):
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_t20, df_resp_t20 = get_prop_resp("t20a")
    # prop_values = metric(df_resp["min_offer_dss"], df_prop["offer"])
    # prop_value = metrics.get_mean(prop_values)
    prop_dss_values = metric(df_resp["min_offer_dss"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)
    auto_dss_values = metric(df_resp_t20["min_offer_dss"], df_prop_t20["ai_offer"])
    auto_dss_value = metrics.get_mean(auto_dss_values)
    dof = 0
    diff = None
    if is_categorical:
        # table = np.array([np.bincount(prop_values), np.bincount(prop_dss_values)])
        # print("TABLE: ", table)
        # checked using: http://vassarstats.net/propcorr.html
        # s, p = sms2.mcnemar(prop_values, prop_dss_values, exact=False, correction=False)
        table, res = rp.crosstab(prop_dss_values, auto_dss_values, test='mcnemar')
        s, p, r = res.results.values
        test_label = f"(mcnemar) H0: equal, Ha: {'two-sided'}"
        print("Conclusion: ",
              generate_cat_stat_sentence(np.mean(prop_dss_values), np.std(prop_dss_values),
                                         np.mean(auto_dss_values), np.std(auto_dss_values),
                                         s, p, dof, diff=diff,
                                         label1=treatment + ".dss", label2="t20.dss"))
    else:
        # s, p = stats.wilcoxon(prop_values, auto_dss_values, alternative=alternative or 'two-sided')
        table, res = rp.ttest(pd.Series(prop_dss_values), pd.Series(auto_dss_values), paired=False)
        test_label = f"(wilcoxon) H0: equal, Ha: {alternative or 'two-sided'}"
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        p = res.results[3]
        r = res.results[9]
        print("Conclusion: ",
              generate_stat_sentence(np.mean(prop_dss_values), np.std(prop_dss_values),
                                     np.mean(auto_dss_values), np.std(auto_dss_values),
                                     s, p, dof, diff=diff,
                                     label1=treatment + ".dss", label2="t20.dss"))
    if as_percentage:
        res = {
            # "Proposer": f'{100 * prop_value:.2f} %',
            "Proposer + DSS": f'{100 * prop_dss_value:.2f} %',
            "T20 Auto DSS": f'{100 * auto_dss_value:.2f} %',
            "prop:dss - auto prop": f'{100 * (prop_dss_value - auto_dss_value):.2f} %',
        }
    else:
        res = {
            # "Proposer": f'{prop_value:.2f}',
            "Proposer + DSS": f'{prop_dss_value:.2f}',
            "T20 Auto DSS": f'{auto_dss_value:.2f}',
            "prop:dss - auto prop": f'{(prop_dss_value - auto_dss_value):.2f} %',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None,
                                metric=None, as_percentage=None, is_categorical=None, alternative=None):
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_ref, df_resp_ref = get_prop_resp("t11a")
    print(metric.__name__)
    metric_values = metric(df_prop)
    metric_value = metrics.get_mean(metric_values)
    metric_ref_values = metric(df_prop_ref)
    metric_value_ref = metrics.get_mean(metric_ref_values)
    metric_values = metrics.get_data(metric_values)
    metric_ref_values = metrics.get_data(metric_ref_values)
    dof = 0
    diff = None
    if is_categorical:
        table, res = rp.crosstab(pd.Series(metric_ref_values), pd.Series(metric_values), test='chi-square')
        s, p, r = res.results.values
        print("Conclusion: ",
              generate_cat_stat_sentence(np.mean(metric_ref_values), np.std(metric_ref_values),
                                         np.mean(metric_values), np.std(metric_values),
                                         s, p, dof, diff=diff,
                                         label1="t11a.dss", label2=treatment + ".dss"))
        test_label = f"(pearson chi2)"
    else:
        # print("Ranksums", stats.ranksums(metric_ref_values, metric_values))
        table, res = rp.ttest(pd.Series(metric_ref_values), pd.Series(metric_values), paired=False)
        s = res.results[2]
        if alternative == "greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        elif alternative in (None, 'two-sided'):
            p = res.results[3]
        r = res.results[9]
        diff = res.results[0]
        dof = res.results[1]
        print("Conclusion: ",
              generate_stat_sentence(np.mean(metric_ref_values), np.std(metric_ref_values),
                                     np.mean(metric_values), np.std(metric_values),
                                     s, p, dof, diff=diff,
                                     label1="t11a.dss", label2=treatment + ".dss"))
        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"
    print("RESUME: ", res)
    print("TABLE: ", table)
    if as_percentage:
        res = {
            # "Proposer": f'{100 * prop_value:.2f} %',
            "Proposer + DSS": f'{100 * metric_value:.2f} %',
            "T11A ": f'{100 * metric_value_ref:.2f} %',
            "prop:dss - auto prop": f'{100 * (metric_value - metric_value_ref):.2f} %',
        }
    else:
        res = {
            # "Proposer": f'{prop_value:.2f}',
            "Proposer + DSS": f'{metric_value:.2f}',
            "T11A": f'{metric_value_ref:.2f}',
            "prop:dss - auto prop": f'{(metric_value - metric_value_ref):.2f} %',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
def find_info_for_all_tVst(key, count):
    df_tvst = df.loc[df.prev_treat_next == key]
    print(Color.BOLD, count, ". The following pre_treat_post combination is being searched :", Color.END,
          Color.PURPLE, key, "[treat-post-pre] :: ", df_tvst.shape, Color.END)
    df_tvst.drop(['prev_treat_next', 'Unnamed: 0', 'user_id'], axis=1, inplace=True)
    # listing the set of columns, just to be sure, after dropping the irrelevant ones
    # print(df_tvst.columns)

    # This function gets the unique instances in the following columns:
    # 1. ts_owner_id : id of the teacher who made the tutoring
    # 2. pl_p_problem_id : id of the pre test
    # 3. plta_problem_id : id of the treatment problem
    # 4. pl_n_problem_id : id of the post test
    def get_Unique_info(column_name):
        unique_instances = df_tvst[column_name].unique()
        # print("--------more info on " + column_name + "------------")
        # print(df_tvst[column_name].value_counts())
        return unique_instances

    unique_teacher_ids = get_Unique_info('ts_owner_id')
    pre_problem_ids = get_Unique_info('pl_p_problem_id')
    treatment_problem_ids = get_Unique_info('plta_problem_id')
    post_problem_ids = get_Unique_info('pl_n_problem_id')
    # print("--------teacher-ids------------------", unique_teacher_ids)
    # print("--------previous-ids------------------", pre_problem_ids)
    # print("--------treatment-ids------------------", treatment_problem_ids)
    # print("--------post-ids------------------", post_problem_ids)
    # print("------------teacher counts---------------------")
    # print(df_tvst['ts_owner_id'].value_counts())

    df_tvst["avg_p_score_question"] = 0
    df_tvst["avg_a_score_question"] = 0
    df_tvst["avg_n_score_question"] = 0

    # This function finds the average question score for all pre, treatment and post problems;
    # we did this because the average score is indicative of the difficulty of the problem.
    def find_avg_question_score(unique_id, column_name, avg_col, correctness_column):
        mean = df_tvst.loc[(df_tvst[column_name] == unique_id)][correctness_column].mean()
        df_tvst.loc[df_tvst[column_name] == unique_id, avg_col] = mean

    # this was done because initially there could have been multiple ids in a single treatment/pre/post
    # condition, which is no longer the case
    # TODO: hence, we might actually remove this whole code block during code refactoring
    def calculate_the_average(ids, column_name, avg_for_column, correctness_column):
        for val in ids:
            find_avg_question_score(val, column_name, avg_for_column, correctness_column)

    calculate_the_average(pre_problem_ids, "pl_p_problem_id", "avg_p_score_question", "pl_p_correct")
    calculate_the_average(treatment_problem_ids, "plta_problem_id", "avg_a_score_question", "plta_correct")
    calculate_the_average(post_problem_ids, "pl_n_problem_id", "avg_n_score_question", "pl_n_correct")

    # average score is indicative of the difficulty of the problem
    avg_p_score = df_tvst["avg_p_score_question"].unique()
    avg_a_score = df_tvst["avg_a_score_question"].unique()
    avg_n_score = df_tvst["avg_n_score_question"].unique()
    # print("average pre score", avg_p_score)
    # print("average treatment score", avg_a_score)
    # print("average post score", avg_n_score)

    df_teacher_sign_test = pd.DataFrame(columns=[
        "teacher_id", "total_treated_exposed", "pre_test", "post_test", "+/-"
    ])
    df_teacher_sign_test_treatment = pd.DataFrame(columns=[
        "teacher_id", "total_treated_used", "pre_test", "post_test", "+/-"
    ])
    for i in range(unique_teacher_ids.size):
        df_teacher_sign_test = df_teacher_sign_test.append(
            {
                "teacher_id": unique_teacher_ids[i],
                "post_test": 0,
                "post_score(avg)": 0,
                "pre_test": 0,
                "pre_score(avg)": 0,
                "+/-": 0,
                "total_treated_exposed": 0
            },
            ignore_index=True)
        df_teacher_sign_test_treatment = df_teacher_sign_test_treatment.append(
            {
                "teacher_id": unique_teacher_ids[i],
                "post_test": 0,
                "post_score(avg)": 0,
                "pre_test": 0,
                "pre_score(avg)": 0,
                "+/-": 0,
                "total_treated_used": 0
            },
            ignore_index=True)

    def extract_information_perTeacher_perTreatmentQuestion(teacher_id):
        df_teacher_specific = df_tvst.loc[df_tvst.ts_owner_id == teacher_id]
        pre_count = (df_teacher_specific.pl_p_correct == 1).sum()
        post_count = (df_teacher_specific.pl_n_correct == 1).sum()
        mean_p = df_teacher_specific.loc[df_teacher_specific.ts_owner_id == teacher_id]["avg_p_score_question"].mean()
        mean_n = df_teacher_specific.loc[df_teacher_specific.ts_owner_id == teacher_id]["avg_n_score_question"].mean()
        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id, "pre_test"] = pre_count
        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id, "pre_score(avg)"] = mean_p
        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id, "post_test"] = post_count
        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id, "post_score(avg)"] = mean_n
        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id,
                                 "total_treated_exposed"] = len(df_teacher_specific.pl_n_correct)
        df_teacher_sign_test.loc[df_teacher_sign_test.teacher_id == teacher_id, "+/-"] = (post_count - pre_count)
        df_teacher_specific = df_teacher_specific.loc[(df_teacher_specific.plta_hint_count > 0)
                                                      | (df_teacher_specific.plta_bottom_hint > 0)]
        pre_count = (df_teacher_specific.pl_p_correct == 1).sum()
        post_count = (df_teacher_specific.pl_n_correct == 1).sum()
        mean_p_t = df_teacher_specific.loc[df_teacher_specific.ts_owner_id == teacher_id]["avg_p_score_question"].mean()
        mean_n_t = df_teacher_specific.loc[df_teacher_specific.ts_owner_id == teacher_id]["avg_n_score_question"].mean()
        df_teacher_sign_test_treatment.loc[df_teacher_sign_test_treatment.teacher_id == teacher_id,
                                           "pre_test"] = pre_count
        df_teacher_sign_test_treatment.loc[df_teacher_sign_test_treatment.teacher_id == teacher_id,
                                           "pre_score(avg)"] = mean_p_t
        df_teacher_sign_test_treatment.loc[df_teacher_sign_test_treatment.teacher_id == teacher_id,
                                           "post_test"] = post_count
        df_teacher_sign_test_treatment.loc[df_teacher_sign_test_treatment.teacher_id == teacher_id,
                                           "post_score(avg)"] = mean_n_t
        df_teacher_sign_test_treatment.loc[df_teacher_sign_test_treatment.teacher_id == teacher_id,
                                           "+/-"] = (post_count - pre_count)
        df_teacher_sign_test_treatment.loc[df_teacher_sign_test_treatment.teacher_id == teacher_id,
                                           "total_treated_used"] = len(df_teacher_specific.pl_n_correct)

    df_teacher_sign_test.fillna(0, inplace=True)
    df_teacher_sign_test_treatment.fillna(0, inplace=True)
    for val in unique_teacher_ids:
        extract_information_perTeacher_perTreatmentQuestion(val)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    n_largest = df_teacher_sign_test_treatment['total_treated_used'].nlargest(2)
    # if (df_teacher_sign_test_treatment['total_treated_used'].max()) > 20:
    if (n_largest.min()) > 40:
        print(Color.RED, "\n", 'This is a good Data more than 15 used instances', Color.END)
        print(n_largest.max(), " min : ", n_largest.min())
        # print("--------------------------------That were in the treatment condition-----------------------------------")
        # print(df_teacher_sign_test)
        print("---------------------------------That used the treatment condition-------------------------------------")
        print(df_teacher_sign_test_treatment)
        X = np.arange(len(df_teacher_sign_test_treatment.teacher_id))
        plt.bar(X + 0.0, df_teacher_sign_test_treatment.pre_test, color='b', width=0.3, label='pretest score')
        plt.bar(X + 0.3, df_teacher_sign_test_treatment.post_test, color='g', width=0.3, label='posttest score')
        plt.bar(X + 0.6, df_teacher_sign_test_treatment.total_treated_used, color='r', width=0.3,
                label='total_treatment_used')
        for id in df_teacher_sign_test_treatment.teacher_id:
            if id in id_name.keys():
                df_teacher_sign_test_treatment.loc[df_teacher_sign_test_treatment.teacher_id == id,
                                                   "teacher_id"] = id_name[id]
        plt.xticks(X + 0.15, df_teacher_sign_test_treatment.teacher_id)
        plt.title(str(count) + ". figure")
        plt.legend()
        plt.show()
        for teacher_id in unique_teacher_ids:
            df_teacher_specific = df_tvst.loc[df_tvst.ts_owner_id == teacher_id]
            df_teacher_specific = df_teacher_specific.loc[(df_teacher_specific.plta_hint_count > 0)
                                                          | (df_teacher_specific.plta_bottom_hint > 0)]
            print("-------------------------------------------------------------------------------------------------------")
            print(Color.BOLD, Color.GREEN, id_name[teacher_id], Color.END, Color.END)
            descriptives, results = rp.ttest(df_teacher_specific.pl_n_correct, df_teacher_specific.pl_p_correct)
            print("-------------------------------------------------------------------------------------------------------")
            print(descriptives)
            print("-------------------------------------------------------------------------------------------------------")
            print(results)
            print("-------------------------------------------------------------------------------------------------------")
            print("\n")
# (docstring fragment) features corrected based on Bonferroni.
# Documentation can be found at https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html
# reject_t: true for hypotheses that can be rejected for the given alpha
# pvals_corrected_t: p-values corrected for multiple tests
# corrected_hypotesis: corrected alpha for the Bonferroni method
# rejected_t_dic: dictionary filtered by reject_t values equal to True
features_pv_05 = dict()
print(type(features_pv_05))
exceptions = dict()
for i in data_train_anatomy._get_numeric_data():
    try:
        descriptives, results = rp.ttest(df_ASD[i], df_TC[i])
        p_val = (results["results"][3])
        if (p_val <= 0.05) and (p_val != 0.0):
            features_pv_05[i] = p_val
    except ZeroDivisionError:
        exceptions[i] = p_val
features_dict_p_val_05 = dict(sorted(features_pv_05.items(), key=lambda x: x[1]))

## Correction Bonferroni
reject_t, pvals_corrected_t, alphacSidak_t, alphacBonf_t = smm.multipletests(
    list(features_dict_p_val_05.values()), alpha=0.05, method='b', returnsorted=True)
corrected_hypotesis = dict((key, value) for (key, value) in zip(features_pv_05.keys(), reject_t))
rejected_t_dic = {k: v for k, v in corrected_hypotesis.items() if v == True}
corrected_pvalues = dict((key, value) for (key, value) in zip(rejected_t_dic.keys(), pvals_corrected_t))
corrected_p_dic = {k: v for k, v in corrected_pvalues.items()}
def stat_groups(self, group1, group2):
    """Returns the statistical analysis of two groups."""
    descriptive_table, result_table = researchpy.ttest(group1, group2)
    descriptive_table = descriptive_table.rename(index={0: 'ApoE3', 1: 'ApoE4', 2: 'ApoE3 + ApoE4'})
    return descriptive_table, result_table
import pandas as pd
import researchpy as rp
import scipy.stats as stats

df = pd.read_csv("interfaceResultsT-Test.csv")
df.info()

summary, results = rp.ttest(group1=df['time_after'][df['sex'] == 'Male'], group1_name="Male",
                            group2=df['time_after'][df['sex'] == 'Female'], group2_name="Female")
print(summary)
print(results)
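# Many of the snippets above pull individual statistics out of the second DataFrame returned by
# rp.ttest by positional index (row 3 for the two-sided p value, row 9 for r) or by the row label
# 'Two side test p value = '. A minimal self-contained sketch of both access patterns, using
# synthetic data and hypothetical names (group_a / group_b are made up for illustration):
import numpy as np
import pandas as pd
import researchpy as rp

rng = np.random.default_rng(42)
group_a = pd.Series(rng.normal(10.0, 2.0, 60), name="group_a")
group_b = pd.Series(rng.normal(11.0, 2.0, 60), name="group_b")

# rp.ttest returns (descriptives, results); 'results' is a two-column DataFrame
descriptives, results = rp.ttest(group_a, group_b)
print(descriptives)
print(results)

# positional access, mirroring the indices the snippets above rely on
p_two_sided = results["results"][3]
r_value = results["results"][9]

# label-based access is more robust if the row order ever changes
label_col = results.columns[0]
p_by_label = results.loc[results[label_col] == "Two side test p value = ", "results"].iat[0]
print(p_two_sided, r_value, p_by_label)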