def bootstrap(dataset: Dataset, combined_data: CombinedData):
    """Bootstrap the median of the explained variable within each category
    of every explanatory variable.

    Returns a dict mapping each category label to the bootstrap result
    (median plus confidence interval) produced by `bs.bootstrap`.
    NOTE(review): categories shared by two explanatory variables would
    overwrite each other in the returned dict — confirm labels are unique.
    """
    calculations = {}
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # Only a single explained variable is supported for now.
        assert len(ys) == 1

        # Main effects: bootstrap y's median separately per category of each x.
        for x in xs:
            for category in x.metadata[categories]:
                group = dataset.select(
                    y.metadata[name],
                    where=[f"{x.metadata[name]} == '{category}'"])
                calculations[category] = bs.bootstrap(
                    group.to_numpy(), stat_func=bs_stats.median)

    return calculations
def rm_one_way_anova(dataset: Dataset, design, combined_data: CombinedData):
    """Run a repeated-measures one-way ANOVA on the single explained variable.

    Explanatory variables named in design["within subjects"] are used as the
    within-subjects factors; the participant-id column of the dataset is the
    subject identifier.

    Returns the fitted AnovaRM results object.
    """
    data = dataset.data
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    # Partition the explanatory variables by study design role.
    between_subjs = []
    within_subjs = []
    for x in xs:
        if "between subjects" in design and design[
                "between subjects"] == x.metadata[name]:
            between_subjs.append(x.metadata[name])
        if "within subjects" in design and design[
                "within subjects"] == x.metadata[name]:
            within_subjs.append(x.metadata[name])

    # Renamed from `id` (shadowed the builtin).
    subject_id = dataset.pid_col_name
    # NOTE: AnovaRM's `between` parameter is not implemented in statsmodels,
    # so between_subjs cannot be passed even though it is collected above.
    aovrm2way = AnovaRM(data,
                        depvar=y.metadata[name],
                        subject=subject_id,
                        within=within_subjs)
    res2way = aovrm2way.fit()
    # BUG FIX: the fitted result was previously computed but never returned.
    return res2way
def add_paired_property(
        dataset,
        combined_data: CombinedData,
        study_type: str,
        design: Dict[str, str] = None):
    """Set combined_data.properties[paired] for bivariate data.

    The data is considered paired when the categorical variable is declared
    as a within-subjects factor in the study design and the other variable
    is continuous.
    """
    # check same sizes are identical
    global paired
    x = None
    y = None
    combined_data.properties[paired] = False

    if isinstance(combined_data, BivariateData):
        if study_type == experiment_identifier:
            # Just need one variable to be Categorical and another to be
            # Continuous (regardless of role).
            x = combined_data.get_vars(iv_identifier)
            y = combined_data.get_vars(dv_identifier)
        else:  # study_type == observational_identifier
            x = combined_data.get_vars(contributor_identifier)
            y = combined_data.get_vars(outcome_identifier)

        if x and y:
            assert (len(x) == len(y) == 1)
            x = x[0]
            y = y[0]

            if x.is_categorical() and y.is_continuous():
                # BUG FIX: `design` defaults to None, so membership/indexing
                # previously raised TypeError when no design was supplied.
                if design and within_subj in design and design[
                        within_subj] == x.metadata[name]:
                    combined_data.properties[paired] = True
def factorial_ANOVA(dataset: Dataset, combined_data: CombinedData):
    # Fit a factorial ANOVA: model the single explained variable as a
    # function of every explanatory variable (treated as categorical via
    # patsy's C()) plus interaction terms, returning the Type II ANOVA table.
    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    formula = f"{y.metadata[name]} ~ "

    # Main effects: y ~ C(x0) + C(x1) + ...
    for i in range(len(xs)):
        x = xs[i]
        formula += f"C({x.metadata[name]})"

        if i < len(xs) - 1:
            formula += " + "

    # Add the interactions
    interactions = []
    for i in range(len(xs)):
        x_i = xs[i]
        inter = f"C({x_i.metadata[name]})"
        for j in range(len(xs)):
            if i != j:
                x_j = xs[j]
                # NOTE(review): `inter` keeps accumulating factors across the
                # inner loop, so with 3+ variables progressively higher-order
                # products are generated (C(a)*C(b), then C(a)*C(b)*C(c)) —
                # confirm that is the intended interaction set.
                inter += " * " + f"C({x_j.metadata[name]})"
                interactions.append(inter)

                # Presumably _is_interaction_unique filters commutative
                # duplicates (e.g. C(a)*C(b) vs C(b)*C(a)) — verify, since
                # `inter` is appended to `interactions` before the check.
                if _is_interaction_unique(interactions, inter):
                    formula += " + " + inter

    ols_formula = ols(formula, data=dataset.data)
    model = ols_formula.fit()
    # Type II sums of squares.
    return sm.stats.anova_lm(model, type=2)
def chi_square(dataset: Dataset, combined_data: CombinedData):
    """Run a chi-square test of independence between one categorical
    explanatory variable and one categorical explained variable.

    Builds the full contingency table of observed counts (one row per
    x category, one column per y category) and returns a ChisquareResult
    wrapping scipy's chi2_contingency output (no Yates correction).

    Raises ValueError when there is not exactly one x and one y.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    # Guard clauses replace the original nested ifs; same errors, same order.
    if len(xs) != 1:
        raise ValueError(
            f"Currently, chi square requires/only supports 1 explanatory variable, instead received: {len(xs)} -- {xs}"
        )
    if len(ys) != 1:
        raise ValueError(
            f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
        )

    x, y = xs[0], ys[0]

    # Category labels, in metadata order.
    x_cat = list(x.metadata[categories])
    y_cat = list(y.metadata[categories])

    contingency_table = []
    contingency_table_key = []  # labels for the order in which counts are stored
    for xc in x_cat:
        row_counts = []
        row_keys = []
        for yc in y_cat:
            cell = dataset.select(y.metadata[name],
                                  where=[
                                      f"{x.metadata[name]} == '{xc}'",
                                      f"{y.metadata[name]} == '{yc}'"
                                  ])
            row_counts.append(len(cell))
            row_keys.append(
                str(x.metadata[name]) + ':' + str(xc) + ' by ' +
                str(y.metadata[name]) + ':' + str(yc))

        assert (len(row_keys) == len(row_counts))
        assert (len(row_counts) == len(y_cat))
        contingency_table.append(row_counts)
        contingency_table_key.append(row_keys)

    chi2, p, dof, ex = stats.chi2_contingency(contingency_table,
                                              correction=False)
    return ChisquareResult(chi2, p, dof, ex)
def wilcoxon_signed_rank(dataset: Dataset, combined_data: CombinedData):
    """Run the Wilcoxon signed-rank test (paired, non-parametric) comparing
    the explained variable between the two categories of the explanatory
    variable.

    Returns scipy's (statistic, p-value) result.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    # BUG FIX: previously unchecked — extra variables/groups were silently
    # ignored (only the first two groups were compared). Assert the arity
    # the test actually requires, consistent with pointbiserial.
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    cat = [k for k, v in x.metadata[categories].items()]
    # The signed-rank test is a two-group paired test.
    assert (len(cat) == 2)

    data = []
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.wilcoxon(data[0], data[1])
def paired_students_t(dataset, combined_data: CombinedData):
    """Run a paired Student's t-test comparing the explained variable
    between the two categories of the explanatory variable.

    Returns scipy's (statistic, p-value) result.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    # BUG FIX: previously unchecked — extra variables/groups were silently
    # ignored (only the first two groups were compared). Assert the arity
    # the test actually requires, consistent with pointbiserial.
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    cat = [k for k, v in x.metadata[categories].items()]
    # A paired t-test compares exactly two related groups.
    assert (len(cat) == 2)

    data = []
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.ttest_rel(data[0], data[1])
def f_test(dataset: Dataset, combined_data: CombinedData):
    """One-way ANOVA (F-test): regress the single explained variable on the
    single categorical explanatory variable and return the Type II ANOVA
    table from statsmodels.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert (len(explanatory) == 1)
    assert (len(explained) == 1)
    x, y = explanatory[0], explained[0]

    # Build and fit the OLS model y ~ C(x).
    model_spec = ols(f"{y.metadata[name]} ~ C({x.metadata[name]})",
                     data=dataset.data)
    fitted = model_spec.fit()
    return sm.stats.anova_lm(fitted, type=2)
def friedman(dataset: Dataset, combined_data: CombinedData):
    """Run the Friedman chi-square test across the groups formed by every
    category of every explanatory variable, on the single explained
    variable. Returns scipy's (statistic, p-value) result.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    # One data column per category, collected across all explanatory vars.
    groups = []
    for x in xs:
        for category in x.metadata[categories]:
            selection = dataset.select(
                y.metadata[name],
                where=[f"{x.metadata[name]} == '{category}'"])
            groups.append(selection)

    return stats.friedmanchisquare(*groups)
def add_eq_variance_property(dataset, combined_data: CombinedData,
                             study_type: str):
    # Compute whether the groups (continuous y split by each category of a
    # categorical x) have equal variance, and record the result on
    # combined_data.properties under the eq_variance key.
    xs = None
    ys = None
    cat_xs = []
    cont_ys = []
    grouped_data = []

    if study_type == experiment_identifier:
        # Just need one variable to be Categorical and another to be
        # Continuous (regardless of role) -- both could be
        # variable_identifier types.
        xs = combined_data.get_vars(iv_identifier)
        ys = combined_data.get_vars(dv_identifier)
    else:  # study_type == observational_identifier
        xs = combined_data.get_vars(contributor_identifier)
        ys = combined_data.get_vars(outcome_identifier)

    # Keep only the variables with the roles the variance check needs.
    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    combined_data.properties[eq_variance] = None

    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(
                        y.metadata[name],
                        where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)
                # NOTE(review): grouped_data accumulates across all (x, y)
                # pairs rather than resetting per pair — confirm intended
                # for the multivariate case.
                if isinstance(combined_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                    combined_data.properties[eq_variance] = eq_var
                elif isinstance(combined_data, MultivariateData):
                    # Keyed per (x, y) pair for multivariate data.
                    combined_data.properties[
                        eq_variance + '::' + x.metadata[name] + ':' +
                        y.metadata[name]] = compute_eq_variance(grouped_data)
                else:
                    raise ValueError(
                        f"combined_data_data object is neither BivariateData nor MultivariateData: {type(combined_data)}"
                    )
def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha):
    """Property check for chi-square-style tests: every cell of the
    cross-tabulation must contain at least 5 observations.

    Returns True when all cell counts are >= 5, False when any cell is
    smaller or when the variables are not all categorical.
    Raises ValueError when there is one x but not exactly one y.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) == 1:
            x = xs[0]
            y = ys[0]

            if x.is_categorical() and y.is_categorical():
                # Get the count for each (x, y) category pair.
                x_cat = [k for k, v in x.metadata[categories].items()]
                y_cat = [k for k, v in y.metadata[categories].items()]

                for xc in x_cat:
                    for yc in y_cat:
                        data = dataset.select(
                            y.metadata[name],
                            where=[
                                f"{x.metadata[name]} == '{xc}'",
                                f"{y.metadata[name]} == '{yc}'"
                            ])

                        # Check that the count is at least five for each of
                        # the (x, y) group pairs.
                        if (len(data) < 5):
                            return False
                return True
            else:
                return False
        else:
            raise ValueError(
                f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
            )
    else:
        # Two explanatory variables: check the x0-by-x1 cross-tabulation.
        x0 = xs[0]
        x1 = xs[1]

        if x0.is_categorical() and x1.is_categorical():
            x0_cat = [k for k, v in x0.metadata[categories].items()]
            x1_cat = [k for k, v in x1.metadata[categories].items()]

            for x0c in x0_cat:
                for x1c in x1_cat:
                    # BUG FIX: this branch referenced undefined names `x`
                    # and `xc` (NameError); it must filter on x0/x0c.
                    data = dataset.select(
                        x1.metadata[name],
                        where=[
                            f"{x0.metadata[name]} == '{x0c}'",
                            f"{x1.metadata[name]} == '{x1c}'"
                        ])

                    # Check that the count is at least five for each of the
                    # (x0, x1) group pairs.
                    if (len(data) < 5):
                        return False
            return True
        else:
            return False
def pointbiserial(dataset: Dataset, combined_data: CombinedData):
    """Point-biserial correlation between the (binary categorical)
    explanatory variable and the continuous explained variable.

    Returns scipy's (correlation, p-value) result.
    NOTE(review): the two positional arguments passed to pointbiserialr are
    the y-values of the two category groups — confirm this matches scipy's
    expected (binary codes, continuous values) calling convention.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x, y = xs[0], ys[0]

    # Split y's data by the categories of x.
    groups = []
    for category in x.metadata[categories]:
        selection = dataset.select(
            y.metadata[name],
            where=[f"{x.metadata[name]} == '{category}'"])
        groups.append(selection)

    return stats.pointbiserialr(groups[0], groups[1])
def add_categories_normal(dataset, combined_data: CombinedData,
                          study_type: str, design: Dict[str, str] = None):
    # For each categorical x / continuous y pair, compute the distribution
    # (normality) of y within each category of x, and store the per-group
    # results on combined_data.properties under the cat_distribution key.
    global cat_distribution
    xs = None
    ys = None
    cat_xs = []
    cont_ys = []
    grouped_data = dict()

    if study_type == experiment_identifier:
        # Just need one variable to be Categorical and another to be
        # Continuous (regardless of role) -- both could be
        # variable_identifier types.
        xs = combined_data.get_vars(iv_identifier)
        ys = combined_data.get_vars(dv_identifier)
    else:  # study_type == observational_identifier
        xs = combined_data.get_vars(contributor_identifier)
        ys = combined_data.get_vars(outcome_identifier)

    # Keep only the variables with the roles the normality check needs.
    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    combined_data.properties[cat_distribution] = None

    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(
                        y.metadata[name],
                        where=[f"{x.metadata[name]} == '{c}'"])
                    # Keyed "x_name:category" -> distribution result.
                    grouped_data_name = str(x.metadata[name] + ':' + c)
                    grouped_data[grouped_data_name] = compute_distribution(
                        data)
                # NOTE(review): the property dict is re-created on every x
                # iteration, so with multiple (x, y) pairs only the last
                # pair's entry survives — confirm intended.
                combined_data.properties[cat_distribution] = dict()
                combined_data.properties[cat_distribution][
                    y.metadata[name] + '::' + x.metadata[name]] = grouped_data
def kruskall_wallis(dataset: Dataset, combined_data: CombinedData):
    """Run the Kruskal-Wallis H-test across the groups formed by every
    category of every explanatory variable, on the single explained
    variable. Returns scipy's (statistic, p-value) result.

    Raises ValueError when an explanatory variable has no categories
    metadata.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        # BUG FIX: this condition previously dropped into a leftover
        # debugger breakpoint (import pdb; pdb.set_trace()); raise an
        # explicit error instead.
        if x.metadata[categories] is None:
            raise ValueError(
                f"Explanatory variable has no categories metadata: {x.metadata[name]}"
            )
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    return stats.kruskal(*data)
def fishers_exact(dataset: Dataset, combined_data: CombinedData):
    """Run Fisher's exact test (two-sided) on the contingency table of one
    categorical explanatory variable by one categorical explained variable.

    Returns a FishersResult holding the odds ratio and p-value.
    """
    assert (len(combined_data.vars) == 2)

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x, y = xs[0], ys[0]

    # Category labels, in metadata order.
    x_cat = list(x.metadata[categories])
    y_cat = list(y.metadata[categories])

    # Compute the contingency table of observed counts.
    contingency_table = []
    contingency_table_key = []  # labels for the order in which counts are stored
    for xc in x_cat:
        row_counts = []
        row_keys = []
        for yc in y_cat:
            cell = dataset.select(y.metadata[name],
                                  where=[
                                      f"{x.metadata[name]} == '{xc}'",
                                      f"{y.metadata[name]} == '{yc}'"
                                  ])
            row_counts.append(len(cell))
            row_keys.append(
                str(x.metadata[name]) + ':' + str(xc) + ' by ' +
                str(y.metadata[name]) + ':' + str(yc))

        assert (len(row_keys) == len(row_counts))
        assert (len(row_counts) == len(y_cat))
        contingency_table.append(row_counts)
        contingency_table_key.append(row_keys)

    odds_ratio, p_value = stats.fisher_exact(contingency_table,
                                             alternative='two-sided')
    return FishersResult(odds_ratio, p_value)
def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    """Property check: do the groups (continuous y split by categorical x)
    have equal variance at significance level alpha?

    Returns True when the equal-variance test's p-value exceeds alpha,
    False when variance could not be computed.
    Raises ValueError for data that is not BivariateData.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()
    cat_xs = []
    cont_ys = []
    grouped_data = []

    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    eq_var = (None, None)
    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(
                        y.metadata[name],
                        where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)
                if isinstance(var_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                # elif isinstance(var_data, MultivariateData):
                #     var_data.properties[eq_variance + '::' + x.metadata[name] + ':' + y.metadata[name]] = compute_eq_variance(grouped_data)
                else:
                    raise ValueError(
                        f"var_data_data object is neither BivariateData nor MultivariateData: {type(var_data)}"
                    )

    if eq_var[0] is None and eq_var[1] is None:
        # BUG FIX: removed leftover debugger breakpoint
        # (import pdb; pdb.set_trace()) that halted execution here.
        # Variance was never computed; treat the property as not holding.
        return False

    # eq_var is (statistic, p-value); the property holds when p > alpha.
    return (eq_var[1] > alpha)
def has_one_y(dataset: Dataset, var_data: CombinedData, alpha):
    """Property check: the data has exactly one explained (y) variable."""
    return len(var_data.get_explained_variables()) == 1
def has_one_x(dataset: Dataset, var_data: CombinedData, alpha):
    """Property check: the data has exactly one explanatory (x) variable."""
    return len(var_data.get_explanatory_variables()) == 1
def synthesize_tests(dataset: Dataset, assumptions: Dict[str, str],
                     combined_data: CombinedData):
    # Use a z3 solver to decide which statistical tests apply to the
    # variables in combined_data: each test becomes a z3 constraint tied to
    # its required properties; properties are either assumed (per the user's
    # `assumptions`) or verified against the dataset. Returns the list of
    # applicable test names.
    construct_all_tests(combined_data)
    global name
    stat_var_map = {}

    # Reorder variables so that y var is at the end
    combined_data._update_vars()

    # Compute unique statistical variable names from the combined data.
    combined_data_vars = []
    for v in combined_data.vars:
        var = StatVar(v.metadata[name])
        stat_var_map[v.metadata[name]] = var
        combined_data_vars.append(var)

    # Assume properties are True based on user assumptions
    solver = z3.Solver()
    # s = Tactic('qflia').solver()
    assumed_props = assume_properties(stat_var_map, assumptions, solver)

    # Update the arity of test-level properties
    for prop in test_props:
        prop._update(len(combined_data.vars))

    # print(combined_data)
    # Apply all tests to the variables we are considering now in combined_data
    for test in all_tests():
        test.apply(*combined_data_vars)

    solver.push()  # Create backtracking point
    model = None  # Store model

    # For each test, add it to the solver as a constraint.
    # Add the tests and their properties
    for test in all_tests():
        log(f"\nCurrently considering {test.name}")
        solver.add(test.__z3__ == z3.And(*test.query()))
        solver.add(test.__z3__ == z3.BoolVal(True))

        # Check the model
        result = solver.check()
        if result == z3.unsat:
            # import pdb; pdb.set_trace()
            log("Test is unsat.\n")
            # print("no more solutions")
            # print(solver.num_scopes())
            # Backtrack: discard this test's constraints.
            solver.pop()
            # model = solver.model() # may need to do a check before call model
        elif result == z3.unknown:
            print("failed to solve")
            try:
                # print(solver.model())
                pass
            except z3.Z3Exception:
                return
        else:
            model = solver.model()

        test_invalid = False

        # Does the test apply?
        # Would this ever be false??
        if model and z3.is_true(model.evaluate(test.__z3__)):
            # Verify the properties for that test
            for prop in test._properties:
                log(f"Testing assumption: {prop._name}.")

                need_to_verify = True
                # If the prop was assumed by the user, skip verification.
                for ap in assumed_props:
                    if prop == ap:
                        log(f"Property was a user assumption. ")
                        prop.property_test_results = "Assumed true."
                        need_to_verify = False

                if need_to_verify:
                    # Does this property need to hold for the test to be valid?
                    # If so, verify that the property does hold
                    if model and z3.is_true(model.evaluate(prop.__z3__)):
                        val = verify_prop(dataset, combined_data, prop)
                        if val:
                            log(f"Property holds.")
                        if not val:
                            # if test.name == 'f_test':
                            #     import pdb; pdb.set_trace()
                            if not test_invalid:
                                # The property does not check
                                log(f"Property FAILS")
                                # Backtrack: remove the last test.
                                solver.pop()
                                test_invalid = True
                                model = None
                            else:
                                # test is already invalid. Going here just
                                # for completeness of logging
                                log(f"EVER GET HERE?")
                        # Record the verification outcome as a constraint.
                        solver.add(prop.__z3__ == z3.BoolVal(val))

        solver.push()  # Push latest state as backtracking point
        # NOTE(review): solver.model() raises if the preceding check was not
        # sat — confirm the constraint set is always satisfiable here.
        solver.check()
        model = solver.model()  # final model
        # import pdb; pdb.set_trace()

    tests_to_conduct = []
    # Could add all the test props first
    # Then add all the tests
    for test in all_tests():
        if model and z3.is_true(model.evaluate(test.__z3__)):
            tests_to_conduct.append(test.name)
        elif not model:  # No test applies
            pass

    reset_all_tests()
    # import pdb; pdb.set_trace()
    return tests_to_conduct
properties=categorical_properties, role=iv_identifier) nominal_dependent = VarData(metadata=nominal_metadata, properties=categorical_properties, role=dv_identifier) ordinal_dependent = VarData(metadata=ordinal_metadata, properties=categorical_properties, role=dv_identifier) ordinal_var = VarData(metadata=ordinal_metadata, properties=categorical_properties, role=null_identifier) nominal_var = VarData(metadata=nominal_metadata, properties=categorical_properties, role=null_identifier) cont_not_specified = CombinedData( vars=[normal_continuous_var, normal_continuous_var]) cont_diff_sample_size = CombinedData( vars=[normal_continuous_var, normal_continuous_large_sample]) cont_iv_dv = CombinedData( vars=[normal_continuous_dependent, normal_continuous_independent]) nominal_iv_cont_dv = CombinedData( vars=[normal_continuous_dependent, nominal_independent]) ordinal_iv_cont_dv = CombinedData( vars=[normal_continuous_dependent, ordinal_independent]) cont_iv_nominal_dv = CombinedData( vars=[nominal_dependent, normal_continuous_independent]) cont_iv_ordinal_dv = CombinedData( vars=[ordinal_dependent, normal_continuous_independent]) ordinal_not_specified = CombinedData(vars=[ordinal_var, ordinal_var]) nominal_not_specified = CombinedData(vars=[nominal_var, nominal_var]) ordinal_nominal_not_specified = CombinedData(vars=[ordinal_var, nominal_var])