def rm_one_way_anova(dataset: Dataset, design, combined_data: CombinedData):
    """Fit a repeated-measures ANOVA on the single explained variable.

    Args:
        dataset: Supplies the raw data (``dataset.data``) and the name of the
            participant-id column (``dataset.pid_col_name``).
        design: Mapping that may contain a "within subjects" entry naming the
            within-subjects factor.
        combined_data: Provides the explanatory and explained variables;
            exactly one explained variable is required.

    Returns:
        The object returned by ``AnovaRM(...).fit()``.

    Raises:
        AssertionError: If there is not exactly one explained variable.
    """
    data = dataset.data
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(ys) == 1)
    y = ys[0]

    # Only within-subjects factors are passed on: the original author noted
    # that the `between=` option of AnovaRM is apparently not implemented in
    # statsmodels, so a "between subjects" entry in `design` is ignored.
    within_subjs = [
        x.metadata[name]
        for x in xs
        if "within subjects" in design
        and design["within subjects"] == x.metadata[name]
    ]

    subject_col = dataset.pid_col_name
    anova = AnovaRM(
        data,
        depvar=y.metadata[name],
        subject=subject_col,
        within=within_subjs,
    )
    # The fitted result used to be computed and then dropped, so callers
    # always received None; hand it back.
    return anova.fit()
def bootstrap(dataset: Dataset, combined_data: CombinedData):
    """Bootstrap the median of the explained variable within each category.

    For every category of every explanatory variable, the explained
    variable's values in that category are selected and handed to
    ``bs.bootstrap`` with ``bs_stats.median`` as the statistic.

    Returns:
        dict mapping each category key to the corresponding ``bs.bootstrap``
        result. Keys are the bare category values, so a later category with
        the same key overwrites an earlier one.

    Raises:
        AssertionError: If more than one explained variable is present.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()

    results = {}
    for outcome in explained:
        # Only a single explained variable is supported for now.
        assert (len(explained) == 1)

        # Main effects: one bootstrap per category of each factor.
        for factor in explanatory:
            for level, _ in factor.metadata[categories].items():
                level_values = dataset.select(
                    outcome.metadata[name],
                    where=[f"{factor.metadata[name]} == '{level}'"],
                )
                results[level] = bs.bootstrap(
                    level_values.to_numpy(), stat_func=bs_stats.median
                )

    return results
def factorial_ANOVA(dataset: Dataset, combined_data: CombinedData):
    """Run a factorial ANOVA (Type II sums of squares) via an OLS model.

    Builds a formula ``y ~ C(x1) + C(x2) + ...`` from the variable names,
    appends the interaction terms accepted by ``_is_interaction_unique``,
    fits it with ``ols`` on ``dataset.data`` and returns the ANOVA table.

    Args:
        dataset: Supplies the data frame (``dataset.data``) for the model.
        combined_data: Provides explanatory variables (treated as categorical
            factors) and exactly one explained variable.

    Returns:
        The table produced by ``sm.stats.anova_lm``.

    Raises:
        AssertionError: If there is not exactly one explained variable.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    # Main effects: every explanatory variable enters as a categorical term.
    factor_terms = [f"C({x.metadata[name]})" for x in xs]
    formula = f"{y.metadata[name]} ~ " + " + ".join(factor_terms)

    # Interactions: starting from each factor, grow a product term by
    # multiplying in the other factors one at a time; each intermediate
    # product is recorded and added to the formula only if the helper
    # reports it as unique.
    interactions = []
    for i, base_term in enumerate(factor_terms):
        inter = base_term
        for j, other_term in enumerate(factor_terms):
            if i == j:
                continue
            inter += " * " + other_term
            interactions.append(inter)

            if _is_interaction_unique(interactions, inter):
                formula += " + " + inter

    model = ols(formula, data=dataset.data).fit()
    # statsmodels reads the sums-of-squares type from the `typ` keyword; the
    # previous `type=2` was silently ignored, so the Type I default was used.
    return sm.stats.anova_lm(model, typ=2)
def chi_square(dataset: Dataset, combined_data: CombinedData):
    """Chi-square test of independence between one x and one y variable.

    Counts the rows of ``dataset`` for every (x category, y category) pair,
    arranges the counts as a contingency table (rows follow x categories,
    columns follow y categories) and passes it to
    ``stats.chi2_contingency`` with ``correction=False``.

    Returns:
        ChisquareResult built from (chi2, p, dof, expected).

    Raises:
        ValueError: If there is not exactly one explanatory variable, or not
            exactly one explained variable.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    if len(xs) != 1:
        raise ValueError(
            "Currently, chi square requires/only supports 1 explanatory variable, "
            f"instead received: {len(xs)} -- {xs}"
        )
    if len(ys) != 1:
        raise ValueError(
            "Currently, chi square requires/only supports 1 explained variable, "
            f"instead received: {len(ys)} -- {ys}"
        )

    x_var, y_var = xs[0], ys[0]

    # Category keys of each variable, in metadata order.
    x_levels = [level for level, _ in x_var.metadata[categories].items()]
    y_levels = [level for level, _ in y_var.metadata[categories].items()]

    observed = []
    # Human-readable labels mirroring the layout of `observed`; they are
    # built alongside the counts but are not part of the returned result.
    observed_labels = []

    for x_level in x_levels:
        row_counts = []
        row_labels = []
        for y_level in y_levels:
            cell = dataset.select(
                y_var.metadata[name],
                where=[
                    f"{x_var.metadata[name]} == '{x_level}'",
                    f"{y_var.metadata[name]} == '{y_level}'",
                ],
            )
            row_counts.append(len(cell))
            row_labels.append(
                str(x_var.metadata[name]) + ':' + str(x_level) + ' by '
                + str(y_var.metadata[name]) + ':' + str(y_level)
            )

        assert (len(row_labels) == len(row_counts))
        assert (len(row_counts) == len(y_levels))
        observed.append(row_counts)
        observed_labels.append(row_labels)

    chi2, p, dof, ex = stats.chi2_contingency(observed, correction=False)
    return ChisquareResult(chi2, p, dof, ex)
def f_test(dataset: Dataset, combined_data: CombinedData):
    """One-way F-test (Type II ANOVA) of y on a single categorical x.

    Fits ``y ~ C(x)`` with ``ols`` on ``dataset.data`` and returns the ANOVA
    table from ``sm.stats.anova_lm``.

    Args:
        dataset: Supplies the data frame (``dataset.data``) for the model.
        combined_data: Must provide exactly one explanatory and exactly one
            explained variable.

    Returns:
        The table produced by ``sm.stats.anova_lm``.

    Raises:
        AssertionError: If there is not exactly one explanatory and one
            explained variable.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)

    x = xs[0]
    y = ys[0]

    model = ols(
        f"{y.metadata[name]} ~ C({x.metadata[name]})", data=dataset.data
    ).fit()
    # statsmodels reads the sums-of-squares type from the `typ` keyword; the
    # previous `type=2` was silently ignored, so the Type I default was used.
    return sm.stats.anova_lm(model, typ=2)
def wilcoxon_signed_rank(dataset: Dataset, combined_data: CombinedData):
    """Wilcoxon signed-rank test between the first two categories of x.

    Selects the explained variable's values for each category of the first
    explanatory variable and passes the samples of the first two categories
    to ``stats.wilcoxon``.

    Returns:
        Whatever ``stats.wilcoxon`` returns for the two samples.
    """
    factor = combined_data.get_explanatory_variables()[0]
    outcome = combined_data.get_explained_variables()[0]

    samples = [
        dataset.select(
            outcome.metadata[name],
            where=[f"{factor.metadata[name]} == '{level}'"],
        )
        for level, _ in factor.metadata[categories].items()
    ]

    first_sample, second_sample = samples[0], samples[1]
    return stats.wilcoxon(first_sample, second_sample)
def paired_students_t(dataset, combined_data: CombinedData):
    """Paired Student's t-test between the first two categories of x.

    Selects the explained variable's values for each category of the first
    explanatory variable and passes the samples of the first two categories
    to ``stats.ttest_rel``.

    Returns:
        Whatever ``stats.ttest_rel`` returns for the two samples.
    """
    factor = combined_data.get_explanatory_variables()[0]
    outcome = combined_data.get_explained_variables()[0]

    samples = [
        dataset.select(
            outcome.metadata[name],
            where=[f"{factor.metadata[name]} == '{level}'"],
        )
        for level, _ in factor.metadata[categories].items()
    ]

    first_sample, second_sample = samples[0], samples[1]
    return stats.ttest_rel(first_sample, second_sample)
def friedman(dataset: Dataset, combined_data: CombinedData):
    """Friedman test across every category of every explanatory variable.

    Collects one sample of the explained variable per category (all
    explanatory variables, in order) and passes the samples to
    ``stats.friedmanchisquare``.

    Returns:
        Whatever ``stats.friedmanchisquare`` returns.

    Raises:
        AssertionError: If there is not exactly one explained variable.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert (len(explained) == 1)
    outcome = explained[0]

    samples = []
    for factor in explanatory:
        samples.extend(
            dataset.select(
                outcome.metadata[name],
                where=[f"{factor.metadata[name]} == '{level}'"],
            )
            for level, _ in factor.metadata[categories].items()
        )

    return stats.friedmanchisquare(*samples)
def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha):
    """Check that every cell of a two-variable cross-tabulation has >= 5 rows.

    With exactly one explanatory variable, the pair checked is (x, y) and
    exactly one explained variable is required. Otherwise the pair is the
    first two explanatory variables.

    Args:
        dataset: Queried with ``dataset.select`` for each category pair.
        var_data: Provides the explanatory and explained variables.
        alpha: Unused; kept so the signature matches the other property
            checks.

    Returns:
        True if both variables are categorical and every category pair has
        at least five rows; False otherwise.

    Raises:
        ValueError: If there is one explanatory variable but not exactly one
            explained variable.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) != 1:
            raise ValueError(f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}")
        first, second = xs[0], ys[0]
    else:
        # The original branch for this case referred to `x`/`xc`, which are
        # never bound here, so it always raised instead of using the first
        # explanatory variable and its category.
        first, second = xs[0], xs[1]

    if not (first.is_categorical() and second.is_categorical()):
        return False

    first_cat = [k for k, v in first.metadata[categories].items()]
    second_cat = [k for k, v in second.metadata[categories].items()]

    for fc in first_cat:
        for sc in second_cat:
            data = dataset.select(
                second.metadata[name],
                where=[
                    f"{first.metadata[name]} == '{fc}'",
                    f"{second.metadata[name]} == '{sc}'",
                ],
            )
            # Every (first, second) category pair needs at least five rows.
            if len(data) < 5:
                return False

    return True
def pointbiserial(dataset: Dataset, combined_data: CombinedData):
    """Point-biserial correlation between a two-category x and y.

    The explained variable's values are selected for the first two
    categories of the explanatory variable. Membership in the first/second
    category is encoded as 0/1 and correlated with the pooled values via
    ``stats.pointbiserialr``.

    Args:
        dataset: Queried with ``dataset.select`` for each category.
        combined_data: Must provide exactly one explanatory and exactly one
            explained variable.

    Returns:
        Whatever ``stats.pointbiserialr`` returns.

    Raises:
        AssertionError: If there is not exactly one explanatory and one
            explained variable.
        IndexError: If the explanatory variable has fewer than two
            categories.
    """
    # Local import keeps the block self-contained; numpy is not known to be
    # imported at file level.
    import numpy as np

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    cat = [k for k, v in x.metadata[categories].items()]
    data = [
        dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
        for c in cat
    ]

    # NOTE(review): assumes `dataset.select` returns a 1-D array-like of the
    # selected column's values - confirm against Dataset.select.
    first_group = np.asarray(data[0]).ravel()
    second_group = np.asarray(data[1]).ravel()

    # pointbiserialr expects (binary indicator, continuous values). The
    # previous call passed the two group samples as (x, y), which correlates
    # the groups element-wise rather than group membership with the outcome.
    membership = np.concatenate(
        [np.zeros(len(first_group)), np.ones(len(second_group))]
    )
    values = np.concatenate([first_group, second_group])

    return stats.pointbiserialr(membership, values)
def kruskall_wallis(dataset: Dataset, combined_data: CombinedData):
    """Kruskal-Wallis H-test across every category of every x variable.

    Collects one sample of the explained variable per category (all
    explanatory variables, in order) and passes the samples to
    ``stats.kruskal``.

    Args:
        dataset: Queried with ``dataset.select`` for each category.
        combined_data: Provides explanatory variables and exactly one
            explained variable.

    Returns:
        Whatever ``stats.kruskal`` returns.

    Raises:
        AssertionError: If there is not exactly one explained variable.
        ValueError: If an explanatory variable has no categories metadata.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        x_categories = x.metadata[categories]
        if x_categories is None:
            # This used to drop into pdb, which blocks non-interactive runs
            # and then failed on `None.items()` anyway; fail fast instead.
            raise ValueError(
                f"Kruskal-Wallis requires categories for explanatory variable "
                f"'{x.metadata[name]}', but none are defined"
            )

        for c in x_categories:
            cat_data = dataset.select(
                y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"]
            )
            data.append(cat_data)

    return stats.kruskal(*data)
def fishers_exact(dataset: Dataset, combined_data: CombinedData):
    """Fisher's exact test between one x and one y variable.

    Counts the rows of ``dataset`` for every (x category, y category) pair,
    arranges the counts as a contingency table (rows follow x categories,
    columns follow y categories) and passes it to ``stats.fisher_exact``
    with a two-sided alternative.

    Returns:
        FishersResult built from (odds_ratio, p_value).

    Raises:
        AssertionError: If ``combined_data`` does not hold exactly two
            variables, one explanatory and one explained.
    """
    assert (len(combined_data.vars) == 2)

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)

    x_var, y_var = xs[0], ys[0]

    # Category keys of each variable, in metadata order.
    x_levels = [level for level, _ in x_var.metadata[categories].items()]
    y_levels = [level for level, _ in y_var.metadata[categories].items()]

    observed = []
    # Human-readable labels mirroring the layout of `observed`; they are
    # built alongside the counts but are not part of the returned result.
    observed_labels = []

    for x_level in x_levels:
        row_counts = []
        row_labels = []
        for y_level in y_levels:
            cell = dataset.select(
                y_var.metadata[name],
                where=[
                    f"{x_var.metadata[name]} == '{x_level}'",
                    f"{y_var.metadata[name]} == '{y_level}'",
                ],
            )
            row_counts.append(len(cell))
            row_labels.append(
                str(x_var.metadata[name]) + ':' + str(x_level) + ' by '
                + str(y_var.metadata[name]) + ':' + str(y_level)
            )

        assert (len(row_labels) == len(row_counts))
        assert (len(row_counts) == len(y_levels))
        observed.append(row_counts)
        observed_labels.append(row_labels)

    odds_ratio, p_value = stats.fisher_exact(observed, alternative='two-sided')
    return FishersResult(odds_ratio, p_value)
def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    """Check whether the groups defined by x have equal variance in y.

    Only runs when there is at least one categorical explanatory variable
    and at least one continuous explained variable. The explained variable's
    values are grouped per category and passed to ``compute_eq_variance``.

    Args:
        dataset: Queried with ``dataset.select`` for each category.
        var_data: Provides the variables; must be a ``BivariateData`` for the
            variance to be computed.
        alpha: Threshold compared against the second element of the
            ``compute_eq_variance`` result (presumably a p-value - confirm
            against that helper).

    Returns:
        False if no variance result was computed; otherwise whether the
        second element of the result is greater than ``alpha``.

    Raises:
        ValueError: If the variance would be computed but ``var_data`` is not
            a ``BivariateData``.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    cat_xs = [x for x in xs if x.is_categorical()]
    cont_ys = [y for y in ys if y.is_continuous()]
    grouped_data = []

    eq_var = (None, None)
    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                for c in x.metadata[categories]:
                    data = dataset.select(
                        y.metadata[name],
                        where=[f"{x.metadata[name]} == '{c}'"],
                    )
                    grouped_data.append(data)

                if isinstance(var_data, BivariateData):
                    eq_var = compute_eq_variance(grouped_data)
                else:
                    raise ValueError(f"var_data_data object is neither BivariateData nor MultivariateData: {type(var_data)}")

    if eq_var[0] is None and eq_var[1] is None:
        # Nothing was computed (no categorical x / continuous y pair). A
        # leftover pdb breakpoint used to sit here and would block any
        # non-interactive run; the property is simply reported as not met.
        return False

    return (eq_var[1] > alpha)
def has_one_x(dataset: Dataset, var_data: CombinedData, alpha):
    """Return True when ``var_data`` has exactly one explanatory variable.

    ``dataset`` and ``alpha`` are not used by this check.
    """
    explanatory = var_data.get_explanatory_variables()
    n_explanatory = len(explanatory)
    return n_explanatory == 1