def vda(dataset, predictions, combined_data: CombinedData):
    """Compute the Vargha-Delaney A effect size for the two predicted groups.

    The first explanatory variable is the grouping variable and the first
    explained variable is the measured outcome. The groups compared are the
    categories equal to ``lhs.value`` and ``rhs.value`` of
    ``predictions[0][0]``.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category (presumably a pandas Series --
            TODO confirm; any sized iterable works here).
        predictions: Nested sequence; only ``predictions[0][0]`` is read.
        combined_data: Supplies the explanatory and explained variables.

    Returns:
        ``(2 * r1 - m * (m + 1)) / (2 * n * m)`` where ``r1`` is the rank sum
        of the ``lhs`` group and ``m``/``n`` are the ``lhs``/``rhs`` sizes.

    Raises:
        ValueError: If there is no prediction, a predicted category is not a
            category of the explanatory variable, or a group is empty.
    """
    if not predictions:
        raise ValueError("vda requires a prediction naming the two groups to compare")
    pred = predictions[0][0]

    x = combined_data.get_explanatory_variables()[0]
    y = combined_data.get_explained_variables()[0]

    lhs = None
    rhs = None
    for c in x.metadata[categories]:
        is_lhs = c == pred.lhs.value
        is_rhs = c == pred.rhs.value
        if not (is_lhs or is_rhs):
            continue  # only the two predicted groups are needed
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        if is_lhs:
            lhs = cat_data
        if is_rhs:
            rhs = cat_data

    if lhs is None or rhs is None:
        raise ValueError(
            "vda could not find the predicted categories "
            f"{pred.lhs.value!r} / {pred.rhs.value!r} in {x.metadata[name]}")

    m = len(lhs)
    n = len(rhs)
    if m == 0 or n == 0:
        raise ValueError("vda requires both compared groups to be non-empty")

    # Plain list concatenation avoids Series.append, which pandas 2.0 removed.
    r = stats.rankdata(list(lhs) + list(rhs))
    r1 = sum(r[:m])

    # A = (r1/m - (m+1)/2)/n  # formula (14) in Vargha and Delaney, 2000
    # Equivalent formula to avoid accuracy errors:
    return (2 * r1 - m * (m + 1)) / (2 * n * m)
def __init__(self, test_to_results, combined_data: CombinedData):
    """Store the test results and build a readable assumption list per test.

    For every test in ``__ALL_TESTS__`` whose name is a key of
    ``test_to_results``, one string per applied property is produced:
    ``"<description>: <variable names>"``, with ``": <result>"`` appended when
    the property carries a non-None ``property_test_results``.
    """
    self.test_to_results = test_to_results
    self.test_to_assumptions = {}

    # Properties whose text names the first explanatory variable.
    first_x_properties = ("has_one_x", "has_independent_observations",
                          "has_paired_observations")

    for test in __ALL_TESTS__:
        if test.name not in test_to_results:
            continue

        # TODO: The names get stale if hypothesize() is called multiple times in a row.
        described = []
        for applied in test._properties:
            prop_name = applied.property.name
            text = f"{applied.property.description}: "

            if prop_name in first_x_properties:
                text += combined_data.get_explanatory_variables()[0].metadata[name]
            elif prop_name == "has_one_y":
                text += combined_data.get_explained_variables()[0].metadata[name]
            else:
                pieces = [f"{stat_var.name}, " for stat_var in applied.vars]
                # Drop the trailing ", " (or the ": " when there are no vars).
                text = (text + "".join(pieces))[:-2]

            if applied.property_test_results is not None:
                text += f": {applied.property_test_results}"
            described.append(text)

        self.test_to_assumptions[test.name] = described
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run a point-biserial correlation between group membership and outcome.

    The outcome values of the first two categories of the single explanatory
    variable are pooled, and a 0/1 membership vector (1 = first category) is
    correlated with them. ``stats.pointbiserialr`` expects a binary variable
    and a continuous one, so passing the two group samples directly is not a
    point-biserial correlation and fails when the groups differ in size.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category.
        predictions: Falsy, or a sequence whose first element is either a
            prediction or a list of predictions (its first item is used).
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` carrying the correlation and its p-value.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    data = [
        dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
        for c in x.metadata[categories]
    ]

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    # Pool both groups and encode membership as the binary variable.
    values = list(data[0]) + list(data[1])
    membership = [1] * len(data[0]) + [0] * len(data[1])
    corr, p_val = stats.pointbiserialr(membership, values)

    dof = None
    test_result = TestResult(name=pointbiserial_name,
                             test_statistic=corr,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)
    return test_result
def friedman(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run the Friedman chi-square test across all category groups.

    One sample is selected per category of every explanatory variable, and
    all samples are passed to ``stats.friedmanchisquare``.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category.
        predictions: Falsy, or a sequence whose first element is either a
            prediction or a list of predictions (its first item is used).
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` named "Friedman Test" (the original label was a
        copy-paste of the Kruskal-Wallis name).
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        for c in x.metadata[categories]:
            data.append(
                dataset.select(y.metadata[name],
                               where=[f"{x.metadata[name]} == '{c}'"]))

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    test_statistic, p_val = stats.friedmanchisquare(*data)
    # The statistic is chi-square distributed with k - 1 degrees of freedom,
    # where k is the number of compared groups (not the size of one group).
    dof = len(data) - 1

    test_result = TestResult(name="Friedman Test",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)
    return test_result
def paired_students_t(dataset, predictions, combined_data: CombinedData):
    """Run a paired Student's t-test on the first two category groups.

    Uses the first explanatory variable for grouping and the first explained
    variable as the outcome, then calls ``stats.ttest_rel`` on the samples of
    the first two categories.

    Returns:
        A ``TestResult`` with the t statistic, p-value and
        ``dof = (n0 + n1) / 2 - 1``.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    x = explanatory[0]
    y = explained[0]
    levels = list(x.metadata[categories].keys())

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    samples = [
        dataset.select(y.metadata[name],
                       where=[f"{x.metadata[name]} == '{level}'"])
        for level in levels
    ]
    first_group, second_group = samples[0], samples[1]

    t_stat, p_val = stats.ttest_rel(first_group, second_group)
    # (Group1 + Group2)/2 - 1
    dof = (len(first_group) + len(second_group)) / 2. - 1

    return TestResult(name=paired_students_name,
                      test_statistic=t_stat,
                      p_value=p_val,
                      prediction=prediction,
                      dof=dof,
                      alpha=combined_data.alpha,
                      x=x,
                      y=y)
def add_paired_property(dataset,
                        combined_data: CombinedData,
                        study_type: str,
                        design: Dict[str, str] = None):
    """Set ``combined_data.properties[paired]`` for a bivariate comparison.

    The property is True only when ``combined_data`` is ``BivariateData``, it
    has exactly one categorical x and one continuous y, and ``design`` names
    that x as its within-subjects variable. Otherwise it is False.

    Args:
        dataset: Unused here.
        combined_data: Variables to inspect; its ``properties`` is mutated.
        study_type: ``experiment_identifier`` selects the iv/dv roles; any
            other value selects the contributor/outcome roles.
        design: Optional study design mapping. ``None`` (the default) is
            treated as "no within-subjects variable" instead of raising
            ``TypeError`` on the membership test.

    NOTE(review): the original carried a "check same sizes are identical"
    remark; no size check is performed here.
    """
    combined_data.properties[paired] = False

    if not isinstance(combined_data, BivariateData):
        return

    if study_type == experiment_identifier:
        # Just need one variable to be Categorical and another to be
        # Continuous (regardless of role)
        x = combined_data.get_vars(iv_identifier)
        y = combined_data.get_vars(dv_identifier)
    else:  # study_type == observational_identifier
        x = combined_data.get_vars(contributor_identifier)
        y = combined_data.get_vars(outcome_identifier)

    if not (x and y):
        return

    assert (len(x) == len(y) == 1)
    x = x[0]
    y = y[0]

    if not (x.is_categorical() and y.is_continuous()):
        return

    if (design is not None and within_subj in design
            and design[within_subj] == x.metadata[name]):
        combined_data.properties[paired] = True
def cohens(dataset, predictions, combined_data: CombinedData):
    """Compute Cohen's d between the two predicted groups.

    The groups are the categories of the first explanatory variable equal to
    ``lhs.value`` and ``rhs.value`` of ``predictions[0][0]``; the outcome is
    the first explained variable.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category.
        predictions: Nested sequence; only ``predictions[0][0]`` is read.
        combined_data: Supplies the explanatory and explained variables.

    Returns:
        ``(mean(lhs) - mean(rhs)) / sqrt((stdev(lhs)**2 + stdev(rhs)**2) / 2)``.

    Raises:
        ValueError: If there is no prediction or a predicted category is not
            a category of the explanatory variable.
    """
    if not predictions:
        raise ValueError("cohens requires a prediction naming the two groups to compare")
    pred = predictions[0][0]

    x = combined_data.get_explanatory_variables()[0]
    y = combined_data.get_explained_variables()[0]

    lhs = None
    rhs = None
    for c in x.metadata[categories]:
        is_lhs = c == pred.lhs.value
        is_rhs = c == pred.rhs.value
        if not (is_lhs or is_rhs):
            continue  # only the two predicted groups are needed
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        if is_lhs:
            lhs = cat_data
        if is_rhs:
            rhs = cat_data

    if lhs is None or rhs is None:
        raise ValueError(
            "cohens could not find the predicted categories "
            f"{pred.lhs.value!r} / {pred.rhs.value!r} in {x.metadata[name]}")

    pooled_sd = sqrt((stdev(lhs)**2 + stdev(rhs)**2) / 2)
    cohens_d = (mean(lhs) - mean(rhs)) / pooled_sd
    return cohens_d
def wilcoxon_signed_rank(dataset: Dataset, predictions,
                         combined_data: CombinedData):
    """Run the Wilcoxon signed-rank test on the first two category groups.

    Groups come from the categories of the first explanatory variable; the
    outcome is the first explained variable.

    Returns:
        A ``TestResult`` with the statistic and p-value from
        ``stats.wilcoxon``.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    x = explanatory[0]
    y = explained[0]

    samples = [
        dataset.select(y.metadata[name],
                       where=[f"{x.metadata[name]} == '{level}'"])
        for level in list(x.metadata[categories].keys())
    ]

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    statistic, p_val = stats.wilcoxon(samples[0], samples[1])
    dof = len(samples[0])  # TODO This might not be correct

    return TestResult(name=wilcoxon_signed_rank_name,
                      test_statistic=statistic,
                      p_value=p_val,
                      prediction=prediction,
                      dof=dof,
                      alpha=combined_data.alpha,
                      x=x,
                      y=y)
def kruskall_wallis(dataset: Dataset, predictions,
                    combined_data: CombinedData):
    """Run the Kruskal-Wallis H-test across all category groups.

    One sample is selected per category of every explanatory variable, and
    all samples are passed to ``stats.kruskal``.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category.
        predictions: Falsy, or a sequence whose first element is either a
            prediction or a list of predictions (its first item is used).
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult``; ``x`` is the first explanatory variable.

    Raises:
        ValueError: If an explanatory variable has no categories.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        if x.metadata[categories] is None:
            raise ValueError(
                f"Kruskal-Wallis requires categorical explanatory variables, "
                f"but {x.metadata[name]} has no categories")
        for c in x.metadata[categories]:
            data.append(
                dataset.select(y.metadata[name],
                               where=[f"{x.metadata[name]} == '{c}'"]))

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    t_stat, p_val = stats.kruskal(*data)
    # H is chi-square distributed with k - 1 degrees of freedom, where k is
    # the number of compared groups (not the size of the first group).
    dof = len(data) - 1

    test_result = TestResult(
        name=kruskall_wallis_name,
        test_statistic=t_stat,
        p_value=p_val,
        prediction=prediction,
        dof=dof,
        alpha=combined_data.alpha,
        x=xs[0],  # TODO: Not sure if it's possible to have multiple x's?
        y=y)
    return test_result
def chi_square(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run a chi-square test of independence on an x-by-y contingency table.

    Rows follow the categories of the single explanatory variable and columns
    those of the single explained variable; each cell is the number of
    records selected for that (x, y) category pair.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            matching records.
        predictions: Falsy, or a sequence whose first element is either a
            prediction or a list of predictions (its first item is used).
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` with the statistic, p-value and the degrees of
        freedom reported by ``stats.chi2_contingency`` (previously discarded
        and replaced with ``None``).

    Raises:
        ValueError: If there is not exactly one explanatory and one explained
            variable.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    if len(xs) != 1:
        raise ValueError(f"Currently, chi square requires/only supports 1 explanatory variable, instead received: {len(xs)} -- {xs}")
    if len(ys) != 1:
        raise ValueError(f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}")

    x = xs[0]
    y = ys[0]
    x_cat = list(x.metadata[categories])
    y_cat = list(y.metadata[categories])

    # Compute the contingency table: one row per x category.
    contingency_table = []
    for xc in x_cat:
        table_row = [
            len(dataset.select(y.metadata[name],
                               where=[f"{x.metadata[name]} == '{xc}'",
                                      f"{y.metadata[name]} == '{yc}'"]))
            for yc in y_cat
        ]
        contingency_table.append(table_row)

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    test_statistic, p_val, dof, _expected = stats.chi2_contingency(
        contingency_table, correction=False)

    test_result = TestResult(name=chi_square_name,
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             x=x,
                             y=y)
    return test_result
def _all_cells_have_five(dataset, first, second):
    """Return True when every (first, second) category pair has >= 5 records."""
    for first_cat in first.metadata[categories]:
        for second_cat in second.metadata[categories]:
            data = dataset.select(second.metadata[name],
                                  where=[
                                      f"{first.metadata[name]} == '{first_cat}'",
                                      f"{second.metadata[name]} == '{second_cat}'"
                                  ])
            # Check that the count is at least five for this group pair
            if len(data) < 5:
                return False
    return True


def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha):
    """Check that every cell of the two-variable cross-tabulation has >= 5 rows.

    With exactly one explanatory variable, the table is x by the single
    explained variable. Otherwise the first two explanatory variables are
    crossed. The second case previously referenced the undefined names ``x``
    and ``xc`` and raised ``NameError``; it now uses ``x0`` and its category.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            matching records.
        var_data: Supplies the explanatory and explained variables.
        alpha: Unused here.

    Returns:
        True if both variables are categorical and all cells hold at least
        five records, otherwise False.

    Raises:
        ValueError: If there is one explanatory variable but not exactly one
            explained variable, or there is no explanatory variable at all.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) != 1:
            raise ValueError(
                f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
            )
        first, second = xs[0], ys[0]
    elif len(xs) >= 2:
        first, second = xs[0], xs[1]
    else:
        raise ValueError(
            f"Frequency check requires at least 1 explanatory variable, instead received: {len(xs)} -- {xs}"
        )

    if not (first.is_categorical() and second.is_categorical()):
        return False
    return _all_cells_have_five(dataset, first, second)
def fishers_exact(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run Fisher's exact test (two-sided) on an x-by-y contingency table.

    Requires exactly two variables overall, one explanatory and one
    explained. Each table cell is the number of records selected for one
    (x category, y category) pair.

    Returns:
        A ``TestResult`` whose statistic is the odds ratio from
        ``stats.fisher_exact``; ``dof`` is None.
    """
    assert(len(combined_data.vars) == 2)

    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert(len(explanatory) == 1)
    assert(len(explained) == 1)
    x = explanatory[0]
    y = explained[0]

    row_levels = list(x.metadata[categories].keys())
    col_levels = list(y.metadata[categories].keys())

    counts = []
    labels = []  # labels for the order in which counts are stored
    for row_level in row_levels:
        count_row = []
        label_row = []
        for col_level in col_levels:
            matches = dataset.select(
                y.metadata[name],
                where=[f"{x.metadata[name]} == '{row_level}'",
                       f"{y.metadata[name]} == '{col_level}'"])
            count_row.append(len(matches))
            label_row.append(
                f"{x.metadata[name]!s}:{row_level!s} by {y.metadata[name]!s}:{col_level!s}")

        assert(len(label_row) == len(count_row))
        assert(len(count_row) == len(col_levels))
        counts.append(count_row)
        labels.append(label_row)

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    odds_ratio, p_val = stats.fisher_exact(counts, alternative='two-sided')

    return TestResult(name=fisher_exact_name,
                      test_statistic=odds_ratio,
                      p_value=p_val,
                      prediction=prediction,
                      dof=None,
                      alpha=combined_data.alpha,
                      x=x,
                      y=y)
def rm_one_way_anova(dataset: Dataset, predictions, design,
                     combined_data: CombinedData):
    """Run a repeated-measures ANOVA and report the row of the last x.

    The within-subjects factors are the explanatory variables named by
    ``design["within subjects"]``. The model is fitted with ``AnovaRM`` using
    ``dataset.pid_col_name`` as the subject column and ``'mean'`` as the
    aggregate function.

    Args:
        dataset: Provides ``data`` and ``pid_col_name``.
        predictions: Falsy, or a sequence whose first element is either a
            prediction or a list of predictions (its first item is used).
        design: Mapping that may contain the key ``"within subjects"``.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` with the F value, p-value and ``(Num DF, Den DF)`` of
        the ANOVA table row named after the last explanatory variable.

    Raises:
        ValueError: If there is no explanatory variable, or the ANOVA table
            has no row for the last explanatory variable (previously an
            ``UnboundLocalError``).
    """
    data = dataset.data
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    if not xs:
        raise ValueError("Repeated measures ANOVA requires at least 1 explanatory variable")

    # Between-subjects factors are not collected: AnovaRM's `between`
    # argument is apparently not implemented in statsmodels.
    within_subjs = [
        x.metadata[name] for x in xs
        if "within subjects" in design
        and design["within subjects"] == x.metadata[name]
    ]

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    key = dataset.pid_col_name
    aovrm2way = AnovaRM(data,
                        depvar=y.metadata[name],
                        subject=key,
                        within=within_subjs,
                        aggregate_func='mean')
    res2way = aovrm2way.fit()
    result_df = res2way.anova_table

    # The original relied on the loop variable leaking out of `for x in xs`,
    # i.e. it reported the row of the last explanatory variable.
    x = xs[-1]
    col_name = x.metadata[name]
    if col_name not in result_df.index:
        raise ValueError(
            f"ANOVA table has no row for {col_name}; rows: {list(result_df.index)}")

    row_data = result_df.loc[col_name]
    test_statistic = row_data['F Value']
    p_val = row_data['Pr > F']
    dof = (row_data['Num DF'], row_data['Den DF'])

    test_result = TestResult(name=rm_one_way_anova_name,
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=result_df,
                             x=x,
                             y=y)
    return test_result
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run a point-biserial correlation between group membership and outcome.

    The outcome values of the first two categories of the single explanatory
    variable are pooled, and a 0/1 membership vector (1 = first category) is
    correlated with them via ``stats.pointbiserialr``. This handles equal and
    unequal group sizes with one code path.

    The earlier equal-size branch passed the two group samples directly to
    ``pointbiserialr``, which expects a binary variable and a continuous one.
    The unequal-size branch relied on ``Series.append``, which pandas 2.0
    removed. The sign convention of that branch's manual formula (first group
    mean minus second group mean) is kept.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category.
        predictions: Falsy, or a sequence whose first element is either a
            prediction or a list of predictions (its first item is used).
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` carrying the correlation and its p-value.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    data = [
        dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
        for c in x.metadata[categories]
    ]

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    # Pool both groups and encode membership as the binary variable.
    values = list(data[0]) + list(data[1])
    membership = [1] * len(data[0]) + [0] * len(data[1])
    corr, p_val = stats.pointbiserialr(membership, values)

    dof = None
    test_result = TestResult(name=POINTBISERIAL_NAME,
                             test_statistic=corr,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)
    return test_result
def f_test(dataset: Dataset, predictions, combined_data: CombinedData):
    """Fit ``y ~ C(x)`` by OLS and return its Type II ANOVA table.

    Args:
        dataset: Provides ``data`` for the OLS fit.
        predictions: Unused here.
        combined_data: Must hold exactly one explanatory and one explained
            variable.

    Returns:
        The table produced by ``sm.stats.anova_lm(model, typ=2)``.
    """
    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    formula = ols(f"{y.metadata[name]} ~ C({x.metadata[name]})",
                  data=dataset.data)
    model = formula.fit()
    # The keyword is `typ`; the earlier `type=2` is not a recognised option,
    # so the requested ANOVA type was not applied.
    return sm.stats.anova_lm(model, typ=2)
def wilcoxon_signed_rank(dataset: Dataset, predictions,
                         combined_data: CombinedData):
    """Return ``stats.wilcoxon`` for the first two category groups.

    Groups come from the categories of the first explanatory variable; the
    outcome is the first explained variable. ``predictions`` is unused.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    x = explanatory[0]
    y = explained[0]

    samples = [
        dataset.select(y.metadata[name],
                       where=[f"{x.metadata[name]} == '{level}'"])
        for level in list(x.metadata[categories].keys())
    ]
    first_group, second_group = samples[0], samples[1]
    return stats.wilcoxon(first_group, second_group)
def friedman(dataset: Dataset, predictions, combined_data: CombinedData):
    """Return ``stats.friedmanchisquare`` over all category groups.

    One sample is selected per category of every explanatory variable, in
    variable order and then category order. ``predictions`` is unused.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert (len(explained) == 1)
    y = explained[0]

    samples = []
    for x in explanatory:
        levels = list(x.metadata[categories].keys())
        samples.extend(
            dataset.select(y.metadata[name],
                           where=[f"{x.metadata[name]} == '{level}'"])
            for level in levels)

    return stats.friedmanchisquare(*samples)
def add_eq_variance_property(dataset, combined_data: CombinedData,
                             study_type: str):
    """Record equal-variance results in ``combined_data.properties``.

    ``properties[eq_variance]`` is first reset to None. For each (categorical
    x, continuous y) pair the outcome is split by the categories of x and
    passed to ``compute_eq_variance``. For ``BivariateData`` the result is
    stored under ``eq_variance``; for ``MultivariateData`` it is stored under
    ``eq_variance + '::' + <x name> + ':' + <y name>``.

    Unlike the earlier version, each pair is tested on its own groups only
    (the group list no longer accumulates across pairs), and variables that
    are not categorical / continuous are skipped instead of being indexed for
    categories.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category.
        combined_data: Variables to inspect; its ``properties`` is mutated.
        study_type: ``experiment_identifier`` selects the iv/dv roles; any
            other value selects the contributor/outcome roles.

    Raises:
        ValueError: If a test is needed but ``combined_data`` is neither
            ``BivariateData`` nor ``MultivariateData``.
    """
    if study_type == experiment_identifier:
        # Just need one variable to be Categorical and another to be
        # Continuous (regardless of role) -- both could be
        # variable_identifier types
        xs = combined_data.get_vars(iv_identifier)
        ys = combined_data.get_vars(dv_identifier)
    else:  # study_type == observational_identifier
        xs = combined_data.get_vars(contributor_identifier)
        ys = combined_data.get_vars(outcome_identifier)

    cat_xs = [x for x in xs if x.is_categorical()]
    cont_ys = [y for y in ys if y.is_continuous()]

    combined_data.properties[eq_variance] = None
    if not (cat_xs and cont_ys):
        return

    is_bivariate = isinstance(combined_data, BivariateData)
    if not is_bivariate and not isinstance(combined_data, MultivariateData):
        raise ValueError(
            f"combined_data_data object is neither BivariateData nor MultivariateData: {type(combined_data)}"
        )

    for y in cont_ys:
        for x in cat_xs:
            grouped_data = [
                dataset.select(y.metadata[name],
                               where=[f"{x.metadata[name]} == '{c}'"])
                for c in x.metadata[categories]
            ]
            eq_var = compute_eq_variance(grouped_data)
            if is_bivariate:
                combined_data.properties[eq_variance] = eq_var
            else:
                combined_data.properties[eq_variance + '::' +
                                         x.metadata[name] + ':' +
                                         y.metadata[name]] = eq_var
def kruskall_wallis(dataset: Dataset, predictions,
                    combined_data: CombinedData):
    """Return ``stats.kruskal`` over all category groups.

    One sample is selected per category of every explanatory variable.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category.
        predictions: Unused here.
        combined_data: Supplies the variables; must hold one explained
            variable.

    Raises:
        ValueError: If an explanatory variable has no categories (the
            message was previously empty).
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        if x.metadata[categories] is None:
            raise ValueError(
                f"Kruskal-Wallis requires categorical explanatory variables, "
                f"but {x.metadata[name]} has no categories")
        for c in x.metadata[categories]:
            data.append(
                dataset.select(y.metadata[name],
                               where=[f"{x.metadata[name]} == '{c}'"]))

    return stats.kruskal(*data)
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Return the point-biserial correlation of group membership vs outcome.

    The outcome values of the first two categories of the single explanatory
    variable are pooled, and a 0/1 membership vector (1 = first category) is
    correlated with them. ``stats.pointbiserialr`` expects a binary variable
    and a continuous one, so passing the two group samples directly is not a
    point-biserial correlation and fails when the groups differ in size.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category.
        predictions: Unused here.
        combined_data: Must hold exactly one explanatory and one explained
            variable.

    Returns:
        The result of ``stats.pointbiserialr`` (correlation and p-value).
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    data = [
        dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
        for c in x.metadata[categories]
    ]

    # Pool both groups and encode membership as the binary variable.
    values = list(data[0]) + list(data[1])
    membership = [1] * len(data[0]) + [0] * len(data[1])
    return stats.pointbiserialr(membership, values)
def add_categories_normal(dataset,
                          combined_data: CombinedData,
                          study_type: str,
                          design: Dict[str, str] = None):
    """Record per-category distribution results in ``combined_data.properties``.

    ``properties[cat_distribution]`` is first reset to None. When there is at
    least one categorical x and one continuous y, it becomes a dict keyed by
    ``<y name> + '::' + <x name>``. Each value maps ``"<x name>:<category>"``
    to ``compute_distribution`` of the outcome values in that category.

    The earlier version re-created the property dict inside the loop, so only
    the last pair survived, and it shared one group dict between all pairs.
    Each pair now gets its own entry and its own group dict.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category.
        combined_data: Variables to inspect; its ``properties`` is mutated.
        study_type: ``experiment_identifier`` selects the iv/dv roles; any
            other value selects the contributor/outcome roles.
        design: Unused here.
    """
    if study_type == experiment_identifier:
        # Just need one variable to be Categorical and another to be
        # Continuous (regardless of role) -- both could be
        # variable_identifier types
        xs = combined_data.get_vars(iv_identifier)
        ys = combined_data.get_vars(dv_identifier)
    else:  # study_type == observational_identifier
        xs = combined_data.get_vars(contributor_identifier)
        ys = combined_data.get_vars(outcome_identifier)

    cat_xs = [x for x in xs if x.is_categorical()]
    cont_ys = [y for y in ys if y.is_continuous()]

    combined_data.properties[cat_distribution] = None
    if not (cat_xs and cont_ys):
        return

    distributions = {}
    for y in cont_ys:
        for x in cat_xs:
            grouped_data = {}
            for c in x.metadata[categories]:
                data = dataset.select(
                    y.metadata[name],
                    where=[f"{x.metadata[name]} == '{c}'"])
                grouped_data[f"{x.metadata[name]}:{c}"] = compute_distribution(data)
            distributions[y.metadata[name] + '::' +
                          x.metadata[name]] = grouped_data

    combined_data.properties[cat_distribution] = distributions
def f_test(dataset: Dataset, predictions, combined_data: CombinedData):
    """Fit ``y ~ C(x)`` by OLS and report the F test for x (Type II ANOVA).

    Args:
        dataset: Provides ``data`` for the OLS fit.
        predictions: Falsy, or a sequence whose first element is either a
            prediction or a list of predictions (its first item is used).
        combined_data: Must hold exactly one explanatory and one explained
            variable; also supplies ``alpha``.

    Returns:
        A ``TestResult`` with ``F``, ``PR(>F)`` and ``df`` taken from the
        ``C(<x name>)`` row of the ANOVA table, plus the full table.

    Raises:
        ValueError: If the ANOVA table has no ``C(<x name>)`` row
            (previously an ``UnboundLocalError``).
    """
    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    formula = ols(f"{y.metadata[name]} ~ C({x.metadata[name]})",
                  data=dataset.data)
    model = formula.fit()

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    # The keyword is `typ`; the earlier `type=2` is not a recognised option,
    # so the requested ANOVA type was not applied.
    result_df = sm.stats.anova_lm(model, typ=2)

    col_name = "C(" + x.metadata[name] + ")"
    if col_name not in result_df.index:
        raise ValueError(
            f"ANOVA table has no row for {col_name}; rows: {list(result_df.index)}")

    row_data = result_df.loc[col_name]
    test_statistic = row_data['F']
    p_val = row_data['PR(>F)']
    dof = row_data['df']

    test_result = TestResult(name=f_test_name,
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=result_df,
                             x=x,
                             y=y)
    return test_result
def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    """Report whether the category groups have equal variance at ``alpha``.

    The outcome is split by the categories of each categorical explanatory
    variable for each continuous explained variable, and the groups are
    passed to ``compute_eq_variance``. Element ``[1]`` of its result is
    compared with ``alpha`` (presumably a p-value -- TODO confirm).

    The earlier version dropped into ``pdb.set_trace()`` when no variance was
    computed; that breakpoint is removed and the function returns False.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category.
        var_data: Supplies the variables; only ``BivariateData`` is handled.
        alpha: Threshold compared with element ``[1]`` of the result.

    Returns:
        ``eq_var[1] > alpha``, or False when there is no categorical x /
        continuous y pair or no variance result was produced.

    Raises:
        ValueError: If a test is needed but ``var_data`` is not
            ``BivariateData``.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    cat_xs = [x for x in xs if x.is_categorical()]
    cont_ys = [y for y in ys if y.is_continuous()]
    if not (cat_xs and cont_ys):
        return False

    if not isinstance(var_data, BivariateData):
        raise ValueError(
            f"var_data_data object is neither BivariateData nor MultivariateData: {type(var_data)}"
        )

    grouped_data = []
    for y in cont_ys:
        for x in cat_xs:
            for c in x.metadata[categories]:
                grouped_data.append(
                    dataset.select(y.metadata[name],
                                   where=[f"{x.metadata[name]} == '{c}'"]))

    eq_var = compute_eq_variance(grouped_data)
    if eq_var[0] is None and eq_var[1] is None:
        return False

    return (eq_var[1] > alpha)
def rm_one_way_anova(dataset: Dataset, design, combined_data: CombinedData):
    """Fit a repeated-measures ANOVA and return the fitted results object.

    Explanatory variables are sorted into between- and within-subjects lists
    by comparing their names with ``design["between subjects"]`` and
    ``design["within subjects"]``. Only the within-subjects list is passed to
    ``AnovaRM``, with ``dataset.pid_col_name`` as the subject column.
    """
    frame = dataset.data
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert(len(explained) == 1)
    y = explained[0]

    between_subjs = []
    within_subjs = []
    for factor in explanatory:
        for role, bucket in (("between subjects", between_subjs),
                             ("within subjects", within_subjs)):
            if role in design and design[role] == factor.metadata[name]:
                bucket.append(factor.metadata[name])

    subject_col = dataset.pid_col_name
    # `between=between_subjs` is apparently not implemented in statsmodels,
    # so only the within-subjects factors are supplied.
    anova = AnovaRM(frame,
                    depvar=y.metadata[name],
                    subject=subject_col,
                    within=within_subjs,
                    aggregate_func='mean')
    return anova.fit()
def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    """Bootstrap the median of the outcome within every category.

    Returns:
        A dict mapping each category to the ``bs.bootstrap`` result for that
        category's outcome values. ``predictions`` is unused.
    """
    results = {}
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()

    for outcome in explained:
        # for now
        assert(len(explained) == 1)

        # Main effects
        for factor in explanatory:
            for level in list(factor.metadata[categories].keys()):
                sample = dataset.select(
                    outcome.metadata[name],
                    where=[f"{factor.metadata[name]} == '{level}'"])
                results[level] = bs.bootstrap(sample.to_numpy(),
                                              stat_func=bs_stats.median)

    return results
def factorial_ANOVA(dataset: Dataset, predictions,
                    combined_data: CombinedData):
    """Fit a factorial OLS model and return its Type II ANOVA table.

    The formula is ``y ~ C(x1) + C(x2) + ...`` followed by interaction terms.
    For each x, a ``C(xi) * C(xj) * ...`` chain is built over the other xs,
    and each prefix that ``_is_interaction_unique`` accepts is appended.

    Args:
        dataset: Provides ``data`` for the OLS fit.
        predictions: Unused here.
        combined_data: Supplies the variables; must hold one explained
            variable.

    Returns:
        The table produced by ``sm.stats.anova_lm(model, typ=2)``.
    """
    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    formula = f"{y.metadata[name]} ~ "
    formula += " + ".join(f"C({x.metadata[name]})" for x in xs)

    # Add the interactions
    interactions = []
    for i, x_i in enumerate(xs):
        inter = f"C({x_i.metadata[name]})"
        for j, x_j in enumerate(xs):
            if i == j:
                continue
            inter += " * " + f"C({x_j.metadata[name]})"
            interactions.append(inter)
            if _is_interaction_unique(interactions, inter):
                formula += " + " + inter

    ols_formula = ols(formula, data=dataset.data)
    model = ols_formula.fit()
    # The keyword is `typ`; the earlier `type=2` is not a recognised option,
    # so the requested ANOVA type was not applied.
    return sm.stats.anova_lm(model, typ=2)
def has_one_x(dataset: Dataset, var_data: CombinedData, alpha):
    """Return True when ``var_data`` has exactly one explanatory variable.

    ``dataset`` and ``alpha`` are accepted but not used.
    """
    explanatory_count = len(var_data.get_explanatory_variables())
    return explanatory_count == 1
def factorial_ANOVA(dataset: Dataset, predictions,
                    combined_data: CombinedData):
    """Fit a factorial OLS model and report one factor's F test (Type II).

    The formula is ``y ~ C(x1) + C(x2) + ...`` followed by interaction terms.
    For each x, a ``C(xi) * C(xj) * ...`` chain is built over the other xs,
    and each prefix that ``_is_interaction_unique`` accepts is appended.

    Args:
        dataset: Provides ``data`` for the OLS fit.
        predictions: Falsy, or a sequence whose first element is either a
            prediction or a list of predictions (its first item is used).
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` with ``F``, ``PR(>F)`` and ``df`` from the
        ``C(<last x name>)`` row, plus the full table.

    Raises:
        ValueError: If there is no explanatory variable, or the ANOVA table
            has no row for the last explanatory variable (previously a
            ``NameError`` / ``UnboundLocalError``).
    """
    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    if not xs:
        raise ValueError("Factorial ANOVA requires at least 1 explanatory variable")

    formula = f"{y.metadata[name]} ~ "
    formula += " + ".join(f"C({x.metadata[name]})" for x in xs)

    # Add the interactions
    interactions = []
    for i, x_i in enumerate(xs):
        inter = f"C({x_i.metadata[name]})"
        for j, x_j in enumerate(xs):
            if i == j:
                continue
            inter += " * " + f"C({x_j.metadata[name]})"
            interactions.append(inter)
            if _is_interaction_unique(interactions, inter):
                formula += " + " + inter

    ols_formula = ols(formula, data=dataset.data)
    model = ols_formula.fit()
    # The keyword is `typ`; the earlier `type=2` is not a recognised option,
    # so the requested ANOVA type was not applied.
    result_df = sm.stats.anova_lm(model, typ=2)

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    # NOTE(review): as before, the reported row belongs to the LAST
    # explanatory variable while the TestResult is tagged with xs[0] --
    # confirm which factor callers expect.
    col_name = "C(" + xs[-1].metadata[name] + ")"
    if col_name not in result_df.index:
        raise ValueError(
            f"ANOVA table has no row for {col_name}; rows: {list(result_df.index)}")

    row_data = result_df.loc[col_name]
    test_statistic = row_data['F']
    p_val = row_data['PR(>F)']
    dof = row_data['df']

    test_result = TestResult(name=factorial_anova_name,
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=result_df,
                             y=y,
                             x=xs[0])
    return test_result
def has_one_y(dataset: Dataset, var_data: CombinedData, alpha):
    """Return True when ``var_data`` has exactly one explained variable.

    ``dataset`` and ``alpha`` are accepted but not used.
    """
    explained_count = len(var_data.get_explained_variables())
    return explained_count == 1
def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    """Bootstrap per-category medians and compare their confidence intervals.

    The median of the outcome is bootstrapped for every category of every
    explanatory variable. The intervals of the first explanatory variable's
    categories are then compared with the first category's interval.
    Overlap yields ``'Greater than or equal to <alpha>'``, no overlap yields
    ``'Less than <alpha>'``.

    Fixes over the earlier version:
      * the reference interval is tracked with ``is None`` instead of
        truthiness, so a lower bound of 0.0 no longer counts as "unset" (and
        no longer trips the assertion);
      * an interval that fully contains the reference interval now counts as
        overlapping.

    NOTE(review): with more than two categories the reported p-value text
    reflects only the last category compared with the first, as before.

    Args:
        dataset: Object whose ``select(column, where=[...])`` returns the
            outcome values of one category (must offer ``to_numpy()``).
        predictions: Falsy, or a sequence whose first element is either a
            prediction or a list of predictions (its first item is used).
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` named "Bootstrap". Its statistic maps each category
        to ``(lower_bound, upper_bound)``, and ``table`` holds the raw
        bootstrap results.
    """
    calculations = {}
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # for now
        assert (len(ys) == 1)

        # Main effects
        for x in xs:
            for c in x.metadata[categories]:
                cat_data = dataset.select(
                    y.metadata[name],
                    where=[f"{x.metadata[name]} == '{c}'"])
                calculations[c] = bs.bootstrap(cat_data.to_numpy(),
                                               stat_func=bs_stats.median)

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    x = xs[0]  # We should do this for the prediction, only....?
    test_statistic = {
        c: (calculations[c].lower_bound, calculations[c].upper_bound)
        for c in x.metadata[categories]
    }

    alpha = combined_data.alpha
    p_val = None
    reference = None  # interval of the first category
    for bounds in test_statistic.values():
        if reference is None:
            reference = bounds
            continue
        # Two closed intervals overlap iff each starts before the other ends.
        overlaps = bounds[0] <= reference[1] and bounds[1] >= reference[0]
        if overlaps:
            p_val = f'Greater than or equal to {alpha}'
        else:
            p_val = f'Less than {alpha}'

    dof = None
    test_result = TestResult(name="Bootstrap",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=calculations)
    return test_result