def bootstrap(dataset: Dataset, combined_data: CombinedData):
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # For now, only support a single explained variable
        assert (len(ys) == 1)

        # Main effects
        for x in xs:
            cat = [k for k, v in x.metadata[categories].items()]
            for c in cat:
                cat_data = dataset.select(
                    y.metadata[name],
                    where=[f"{x.metadata[name]} == '{c}'"])
                stat = bs.bootstrap(cat_data.to_numpy(),
                                    stat_func=bs_stats.median)
                # Store all the medians & confidence intervals, keyed by category
                calculations[c] = stat

    # Return all the medians & CIs
    return calculations

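# Hypothetical usage sketch (not part of the original module): the `bootstrapped`
# package's bs.bootstrap() is assumed here to return a result object exposing
# .value, .lower_bound, and .upper_bound. If so, the dict returned by bootstrap()
# above could be summarized per category like this; the function name is made up.
def summarize_bootstrap_example(calculations: dict):
    # Map each category to its bootstrapped median and confidence interval bounds
    summary = {}
    for category, stat in calculations.items():
        summary[category] = (stat.value, stat.lower_bound, stat.upper_bound)
    return summary
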
def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha):
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) == 1:
            x = xs[0]
            y = ys[0]

            if x.is_categorical() and y.is_categorical():
                # Get the count for each category
                x_cat = [k for k, v in x.metadata[categories].items()]
                y_cat = [k for k, v in y.metadata[categories].items()]

                for xc in x_cat:
                    for yc in y_cat:
                        data = dataset.select(
                            y.metadata[name],
                            where=[f"{x.metadata[name]} == '{xc}'",
                                   f"{y.metadata[name]} == '{yc}'"])

                        # Check that the count is at least five for each of the (x, y) group pairs
                        if len(data) < 5:
                            return False
                return True
            else:
                return False
        else:
            raise ValueError(
                f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
            )
    else:
        x0 = xs[0]
        x1 = xs[1]

        if x0.is_categorical() and x1.is_categorical():
            # Get the count for each category
            x0_cat = [k for k, v in x0.metadata[categories].items()]
            x1_cat = [k for k, v in x1.metadata[categories].items()]

            for x0c in x0_cat:
                for x1c in x1_cat:
                    data = dataset.select(
                        x1.metadata[name],
                        where=[f"{x0.metadata[name]} == '{x0c}'",
                               f"{x1.metadata[name]} == '{x1c}'"])

                    # Check that the count is at least five for each of the (x0, x1) group pairs
                    if len(data) < 5:
                        return False
            return True
        else:
            return False

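# Illustrative sketch (not part of the original module): the check above enforces
# the common rule of thumb that every group count should be at least five before
# running a chi-square test. On a prebuilt table of observed counts, the same idea
# reduces to the one-liner below; the function name and table layout are made up.
def all_cells_at_least_five_example(contingency_table):
    # contingency_table is a list of rows of observed counts
    return all(count >= 5 for row in contingency_table for count in row)
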
def chi_square(dataset: Dataset, combined_data: CombinedData):
    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) == 1:
            x = xs[0]
            y = ys[0]

            # Get the count for each category
            x_cat = [k for k, v in x.metadata[categories].items()]
            y_cat = [k for k, v in y.metadata[categories].items()]

            contingency_table = []
            contingency_table_key = []  # labels for the order in which data is stored in the data array (defined above)

            for xc in x_cat:
                table_row = []
                table_row_key = []

                for yc in y_cat:
                    data = dataset.select(y.metadata[name],
                                          where=[
                                              f"{x.metadata[name]} == '{xc}'",
                                              f"{y.metadata[name]} == '{yc}'"
                                          ])
                    table_row.append(len(data))

                    x_y_key = str(x.metadata[name]) + ':' + str(xc) + ' by ' + str(
                        y.metadata[name]) + ':' + str(yc)
                    table_row_key.append(x_y_key)

                assert (len(table_row_key) == len(table_row))
                assert (len(table_row) == len(y_cat))

                contingency_table.append(table_row)
                contingency_table_key.append(table_row_key)
        else:
            raise ValueError(
                f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
            )
    else:
        raise ValueError(
            f"Currently, chi square requires/only supports 1 explanatory variable, instead received: {len(xs)} -- {xs}"
        )

    chi2, p, dof, ex = stats.chi2_contingency(contingency_table,
                                              correction=False)

    return ChisquareResult(chi2, p, dof, ex)

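# Illustrative sketch (not part of the original module): chi_square() above hands
# the counts it builds to scipy's chi2_contingency. The hard-coded 2x2 table below
# is made-up data, used only to show the shape of the inputs and outputs; the
# function name is hypothetical.
def chi_square_contingency_example():
    from scipy import stats as scipy_stats

    # Rows are categories of x, columns are categories of y (observed counts)
    contingency_table = [[12, 7],
                         [5, 16]]
    chi2, p, dof, expected = scipy_stats.chi2_contingency(contingency_table,
                                                          correction=False)
    return chi2, p, dof, expected
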
def wilcoxon_signed_rank(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    x = xs[0]
    y = ys[0]

    cat = [k for k, v in x.metadata[categories].items()]
    data = []
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.wilcoxon(data[0], data[1])

def friedman(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    return stats.friedmanchisquare(*data)

def pointbiserial(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    cat = [k for k, v in x.metadata[categories].items()]
    data = []
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.pointbiserialr(data[0], data[1])

def kruskall_wallis(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        if x.metadata[categories] is None:
            import pdb
            pdb.set_trace()
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    return stats.kruskal(*data)

def fishers_exact(dataset: Dataset, combined_data: CombinedData):
    assert (len(combined_data.vars) == 2)

    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)

    x = xs[0]
    y = ys[0]

    # Get the count for each category
    x_cat = [k for k, v in x.metadata[categories].items()]
    y_cat = [k for k, v in y.metadata[categories].items()]

    contingency_table = []
    contingency_table_key = []  # labels for the order in which data is stored in the data array (defined above)

    for xc in x_cat:
        table_row = []
        table_row_key = []

        for yc in y_cat:
            data = dataset.select(y.metadata[name],
                                  where=[
                                      f"{x.metadata[name]} == '{xc}'",
                                      f"{y.metadata[name]} == '{yc}'"
                                  ])
            table_row.append(len(data))

            x_y_key = str(x.metadata[name]) + ':' + str(xc) + ' by ' + str(
                y.metadata[name]) + ':' + str(yc)
            table_row_key.append(x_y_key)

        assert (len(table_row_key) == len(table_row))
        assert (len(table_row) == len(y_cat))

        contingency_table.append(table_row)
        contingency_table_key.append(table_row_key)

    odds_ratio, p_value = stats.fisher_exact(contingency_table,
                                             alternative='two-sided')

    return FishersResult(odds_ratio, p_value)

def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()
    cat_xs = []
    cont_ys = []
    grouped_data = []

    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    eq_var = (None, None)

    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(y.metadata[name],
                                          where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)

                if isinstance(var_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                # elif isinstance(var_data, MultivariateData):
                #     var_data.properties[eq_variance + '::' + x.metadata[name] + ':' + y.metadata[name]] = compute_eq_variance(grouped_data)
                else:
                    raise ValueError(
                        f"var_data object is neither BivariateData nor MultivariateData: {type(var_data)}")

    if eq_var[0] is None and eq_var[1] is None:
        import pdb; pdb.set_trace()
        # raise Exception("did not compute variance, this is a bug")
        return False

    return (eq_var[1] > alpha)

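# Hypothetical sketch (not part of the original module): compute_eq_variance() is
# defined elsewhere; has_equal_variance() only assumes it returns a
# (statistic, p_value) pair that can be indexed and compared against alpha.
# One common way to produce such a pair is Levene's test, for example:
def compute_eq_variance_example(grouped_data):
    from scipy import stats as scipy_stats

    # Levene's test for equal variances across the grouped samples;
    # returns (test statistic, p-value), matching how eq_var is indexed above.
    return scipy_stats.levene(*grouped_data)
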
def load_data_from_url(url: str, name: str):
    return Dataset.load(url, name)


def load_data(source_name: str, vars: list, pid: str):
    return Dataset(source_name, vars, pid)


def get_data(dataset: Dataset, var: VarData):
    return dataset.select(var.metadata[name], where=f"{var.metadata[query]}")

def evaluate(dataset: Dataset, expr: Node, assumptions: Dict[str, str], design: Dict[str, str] = None):
    if isinstance(expr, Variable):
        # dataframe = dataset[expr.name]  # I don't know if we want this. We may want to just store
        # the query (in metadata?) and then use the query to get raw data later....(for user, not interpreter?)
        metadata = dataset.get_variable_data(expr.name)  # (dtype, categories)
        # if expr.name == 'strategy':
        #     import pdb; pdb.set_trace()
        metadata['var_name'] = expr.name
        metadata['query'] = ''
        return VarData(metadata)

    elif isinstance(expr, Literal):
        data = pd.Series([expr.value] * len(dataset.data),
                         index=dataset.data.index)  # Series filled with the literal value
        # metadata = None  # metadata=None means literal
        metadata = dict()
        metadata['var_name'] = ''  # because not a var in the dataset
        metadata['query'] = ''
        metadata['value'] = expr.value
        return VarData(data, metadata)

    elif isinstance(expr, Equal):
        lhs = evaluate(dataset, expr.lhs, assumptions, design)
        rhs = evaluate(dataset, expr.rhs, assumptions, design)
        assert isinstance(lhs, VarData)
        assert isinstance(rhs, VarData)

        dataframe = lhs.dataframe[lhs.dataframe == rhs.dataframe]
        metadata = lhs.metadata

        if isinstance(expr.rhs, Literal):
            metadata['query'] = f" == '{rhs.metadata['value']}'"  # override lhs metadata for query
        elif isinstance(expr.rhs, Variable):
            metadata['query'] = f" == {rhs.metadata['var_name']}"
        else:
            raise ValueError(f"Not implemented for {rhs}")

        return VarData(metadata)

    elif isinstance(expr, NotEqual):
        rhs = evaluate(dataset, expr.rhs, assumptions, design)
        lhs = evaluate(dataset, expr.lhs, assumptions, design)
        assert isinstance(rhs, VarData)
        assert isinstance(lhs, VarData)

        dataframe = lhs.dataframe[lhs.dataframe != rhs.dataframe]
        metadata = lhs.metadata

        if isinstance(expr.rhs, Literal):
            metadata['query'] = " != ''"  # override lhs metadata for query
        elif isinstance(expr.rhs, Variable):
            metadata['query'] = f" != {rhs.metadata['var_name']}"
        else:
            raise ValueError(f"Not implemented for {rhs}")

        return VarData(metadata)

    elif isinstance(expr, LessThan):
        lhs = evaluate(dataset, expr.lhs, assumptions, design)
        rhs = evaluate(dataset, expr.rhs, assumptions, design)
        assert isinstance(lhs, VarData)
        assert isinstance(rhs, VarData)

        dataframe = None
        metadata = rhs.metadata

        if not lhs.metadata:
            raise ValueError('Malformed Relation. Filter on Variables must have variable as rhs')
        elif lhs.metadata['dtype'] is DataType.NOMINAL:
            raise ValueError('Cannot compare nominal values with Less Than')
        elif lhs.metadata['dtype'] is DataType.ORDINAL:
            # TODO May want to add a case should RHS and LHS both be variables
            # assert (rhs.metadata is None)
            comparison = rhs.dataframe.iloc[0]

            if isinstance(comparison, str):
                categories = lhs.metadata['categories']  # OrderedDict
                # Get raw Pandas Series indices for desired data
                ids = [
                    i for i, x in enumerate(lhs.dataframe)
                    if categories[x] < categories[comparison]
                ]
                # Get Pandas Series set indices for desired data
                p_ids = [lhs.dataframe.index.values[i] for i in ids]
                # Create new Pandas Series with only the desired data, using set indices
                dataframe = pd.Series(lhs.dataframe, p_ids)
                dataframe.index.name = dataset.pid_col_name
            elif np.issubdtype(comparison, np.integer):
                categories = lhs.metadata['categories']  # OrderedDict
                # Get raw Pandas Series indices for desired data
                ids = [
                    i for i, x in enumerate(lhs.dataframe)
                    if categories[x] < comparison
                ]
                # Get Pandas Series set indices for desired data
                p_ids = [lhs.dataframe.index.values[i] for i in ids]
                # Create new Pandas Series with only the desired data, using set indices
                dataframe = pd.Series(lhs.dataframe, p_ids)
                dataframe.index.name = dataset.pid_col_name
            else:
                raise ValueError(
                    f"Cannot compare ORDINAL variables to {type(rhs.dataframe.iloc[0])}")
        elif (lhs.metadata['dtype'] is DataType.INTERVAL
              or lhs.metadata['dtype'] is DataType.RATIO):
            comparison = rhs.dataframe.iloc[0]
            # Get raw Pandas Series indices for desired data
            ids = [i for i, x in enumerate(lhs.dataframe) if x < comparison]
            # Get Pandas Series set indices for desired data
            p_ids = [lhs.dataframe.index.values[i] for i in ids]
            # Create new Pandas Series with only the desired data, using set indices
            dataframe = pd.Series(lhs.dataframe, p_ids)
            dataframe.index.name = dataset.pid_col_name
        else:
            raise Exception(f"Invalid Less Than Operation:{lhs} < {rhs}")

        if isinstance(expr.rhs, Literal):
            metadata['query'] = " < ''"  # override lhs metadata for query
        elif isinstance(expr.rhs, Variable):
            metadata['query'] = f" < {rhs.metadata['var_name']}"
        else:
            raise ValueError(f"Not implemented for {rhs}")

        return VarData(metadata)

    elif isinstance(expr, LessThanEqual):
        lhs = evaluate(dataset, expr.lhs, assumptions, design)
        rhs = evaluate(dataset, expr.rhs, assumptions, design)
        assert isinstance(lhs, VarData)
        assert isinstance(rhs, VarData)

        dataframe = None
        metadata = rhs.metadata

        if not lhs.metadata:
            raise ValueError('Malformed Relation. Filter on Variables must have variable as rhs')
        elif lhs.metadata['dtype'] is DataType.NOMINAL:
            raise ValueError('Cannot compare nominal values with Less Than Equal')
        elif lhs.metadata['dtype'] is DataType.ORDINAL:
            # TODO May want to add a case should RHS and LHS both be variables
            # assert (rhs.metadata is None)
            comparison = rhs.dataframe.iloc[0]

            if isinstance(comparison, str):
                categories = lhs.metadata['categories']  # OrderedDict
                # Get raw Pandas Series indices for desired data
                ids = [
                    i for i, x in enumerate(lhs.dataframe)
                    if categories[x] <= categories[comparison]
                ]
                # Get Pandas Series set indices for desired data
                p_ids = [lhs.dataframe.index.values[i] for i in ids]
                # Create new Pandas Series with only the desired data, using set indices
                dataframe = pd.Series(lhs.dataframe, p_ids)
                dataframe.index.name = dataset.pid_col_name
            elif np.issubdtype(comparison, np.integer):
                categories = lhs.metadata['categories']  # OrderedDict
                # Get raw Pandas Series indices for desired data
                ids = [
                    i for i, x in enumerate(lhs.dataframe)
                    if categories[x] <= comparison
                ]
                # Get Pandas Series set indices for desired data
                p_ids = [lhs.dataframe.index.values[i] for i in ids]
                # Create new Pandas Series with only the desired data, using set indices
                dataframe = pd.Series(lhs.dataframe, p_ids)
                dataframe.index.name = dataset.pid_col_name
            else:
                raise ValueError(
                    f"Cannot compare ORDINAL variables to {type(rhs.dataframe.iloc[0])}")
        elif (lhs.metadata['dtype'] is DataType.INTERVAL
              or lhs.metadata['dtype'] is DataType.RATIO):
            comparison = rhs.dataframe.iloc[0]
            # Get raw Pandas Series indices for desired data
            ids = [i for i, x in enumerate(lhs.dataframe) if x <= comparison]
            # Get Pandas Series set indices for desired data
            p_ids = [lhs.dataframe.index.values[i] for i in ids]
            # Create new Pandas Series with only the desired data, using set indices
            dataframe = pd.Series(lhs.dataframe, p_ids)
            dataframe.index.name = dataset.pid_col_name
        else:
            raise Exception(f"Invalid Less Than Equal Operation:{lhs} <= {rhs}")

        if isinstance(expr.rhs, Literal):
            metadata['query'] = " <= ''"  # override lhs metadata for query
        elif isinstance(expr.rhs, Variable):
            metadata['query'] = f" <= {rhs.metadata['var_name']}"
        else:
            raise ValueError(f"Not implemented for {rhs}")

        return VarData(metadata)

    elif isinstance(expr, GreaterThan):
        lhs = evaluate(dataset, expr.lhs, assumptions, design)
        rhs = evaluate(dataset, expr.rhs, assumptions, design)
        assert isinstance(lhs, VarData)
        assert isinstance(rhs, VarData)

        dataframe = None
        metadata = rhs.metadata

        if not lhs.metadata:
            raise ValueError('Malformed Relation. Filter on Variables must have variable as rhs')
        elif lhs.metadata['dtype'] is DataType.NOMINAL:
            raise ValueError('Cannot compare nominal values with Greater Than')
        elif lhs.metadata['dtype'] is DataType.ORDINAL:
            # TODO May want to add a case should RHS and LHS both be variables
            # assert (rhs.metadata is None)
            comparison = rhs.dataframe.iloc[0]

            if isinstance(comparison, str):
                categories = lhs.metadata['categories']  # OrderedDict
                # Get raw Pandas Series indices for desired data
                ids = [
                    i for i, x in enumerate(lhs.dataframe)
                    if categories[x] > categories[comparison]
                ]
                # Get Pandas Series set indices for desired data
                p_ids = [lhs.dataframe.index.values[i] for i in ids]
                # Create new Pandas Series with only the desired data, using set indices
                dataframe = pd.Series(lhs.dataframe, p_ids)
                dataframe.index.name = dataset.pid_col_name
            elif np.issubdtype(comparison, np.integer):
                categories = lhs.metadata['categories']  # OrderedDict
                # Get raw Pandas Series indices for desired data
                ids = [
                    i for i, x in enumerate(lhs.dataframe)
                    if categories[x] > comparison
                ]
                # Get Pandas Series set indices for desired data
                p_ids = [lhs.dataframe.index.values[i] for i in ids]
                # Create new Pandas Series with only the desired data, using set indices
                dataframe = pd.Series(lhs.dataframe, p_ids)
                dataframe.index.name = dataset.pid_col_name
            else:
                raise ValueError(
                    f"Cannot compare ORDINAL variables to {type(rhs.dataframe.iloc[0])}")
        elif (lhs.metadata['dtype'] is DataType.INTERVAL
              or lhs.metadata['dtype'] is DataType.RATIO):
            comparison = rhs.dataframe.iloc[0]
            # Get raw Pandas Series indices for desired data
            ids = [i for i, x in enumerate(lhs.dataframe) if x > comparison]
            # Get Pandas Series set indices for desired data
            p_ids = [lhs.dataframe.index.values[i] for i in ids]
            # Create new Pandas Series with only the desired data, using set indices
            dataframe = pd.Series(lhs.dataframe, p_ids)
            dataframe.index.name = dataset.pid_col_name
        else:
            raise Exception(f"Invalid Greater Than Operation:{lhs} > {rhs}")

        if isinstance(expr.rhs, Literal):
            metadata['query'] = " > ''"  # override lhs metadata for query
        elif isinstance(expr.rhs, Variable):
            metadata['query'] = f" > {rhs.metadata['var_name']}"
        else:
            raise ValueError(f"Not implemented for {rhs}")

        return VarData(metadata)

    elif isinstance(expr, GreaterThanEqual):
        lhs = evaluate(dataset, expr.lhs, assumptions, design)
        rhs = evaluate(dataset, expr.rhs, assumptions, design)
        assert isinstance(lhs, VarData)
        assert isinstance(rhs, VarData)

        dataframe = None
        metadata = rhs.metadata

        if not lhs.metadata:
            raise ValueError('Malformed Relation. Filter on Variables must have variable as rhs')
        elif lhs.metadata['dtype'] is DataType.NOMINAL:
            raise ValueError('Cannot compare nominal values with Greater Than Equal')
        elif lhs.metadata['dtype'] is DataType.ORDINAL:
            # TODO May want to add a case should RHS and LHS both be variables
            # assert (rhs.metadata is None)
            comparison = rhs.dataframe.iloc[0]

            if isinstance(comparison, str):
                categories = lhs.metadata['categories']  # OrderedDict
                # Get raw Pandas Series indices for desired data
                ids = [
                    i for i, x in enumerate(lhs.dataframe)
                    if categories[x] >= categories[comparison]
                ]
                # Get Pandas Series set indices for desired data
                p_ids = [lhs.dataframe.index.values[i] for i in ids]
                # Create new Pandas Series with only the desired data, using set indices
                dataframe = pd.Series(lhs.dataframe, p_ids)
                dataframe.index.name = dataset.pid_col_name
            elif np.issubdtype(comparison, np.integer):
                categories = lhs.metadata['categories']  # OrderedDict
                # Get raw Pandas Series indices for desired data
                ids = [
                    i for i, x in enumerate(lhs.dataframe)
                    if categories[x] >= comparison
                ]
                # Get Pandas Series set indices for desired data
                p_ids = [lhs.dataframe.index.values[i] for i in ids]
                # Create new Pandas Series with only the desired data, using set indices
                dataframe = pd.Series(lhs.dataframe, p_ids)
                dataframe.index.name = dataset.pid_col_name
            else:
                raise ValueError(
                    f"Cannot compare ORDINAL variables to {type(rhs.dataframe.iloc[0])}")
        elif (lhs.metadata['dtype'] is DataType.INTERVAL
              or lhs.metadata['dtype'] is DataType.RATIO):
            comparison = rhs.dataframe.iloc[0]
            # Get raw Pandas Series indices for desired data
            ids = [i for i, x in enumerate(lhs.dataframe) if x >= comparison]
            # Get Pandas Series set indices for desired data
            p_ids = [lhs.dataframe.index.values[i] for i in ids]
            # Create new Pandas Series with only the desired data, using set indices
            dataframe = pd.Series(lhs.dataframe, p_ids)
            dataframe.index.name = dataset.pid_col_name
        else:
            raise Exception(f"Invalid Greater Than Equal Operation:{lhs} >= {rhs}")

        if isinstance(expr.rhs, Literal):
            metadata['query'] = " >= ''"  # override lhs metadata for query
        elif isinstance(expr.rhs, Variable):
            metadata['query'] = f" >= {rhs.metadata['var_name']}"
        else:
            raise ValueError(f"Not implemented for {rhs}")

        return VarData(metadata)

    elif isinstance(expr, Relate):
        vars = []

        for v in expr.vars:
            eval_v = evaluate(dataset, v, assumptions, design)
            if not eval_v:
                raise ValueError(
                    "The variables you are referencing are not defined as variables in your list of variables."
                )
            assert isinstance(eval_v, VarData)
            vars.append(eval_v)

        # What kind of study are we analyzing?
        study_type = determine_study_type(vars, design)

        # Assign roles to the variables we are analyzing
        vars = assign_roles(vars, study_type, design)

        # Compute individual variable properties
        # vars = compute_data_properties(dataset, vars)

        combined_data = None
        # Do we have a Bivariate analysis?
        if len(vars) == 2:
            combined_data = BivariateData(vars, study_type,
                                          alpha=float(assumptions['alpha']))
        else:  # Do we have a Multivariate analysis?
            combined_data = MultivariateData(vars, study_type,
                                             alpha=float(assumptions['alpha']))

        # Add paired property
        add_paired_property(dataset, combined_data, study_type, design)  # check sample sizes are identical

        # Compute between variable level properties
        # combined_data = compute_combined_data_properties(dataset, combined_data, study_type, design)

        # CEGIS-style synthesis
        # Synthesize tests
        tests = synthesize_tests(dataset, assumptions, combined_data)
        # import pdb; pdb.set_trace()

        """
        # verify_properties(properties_and_tests)
        # get_tests
        # execute_tests
        # interpret_tests_results
        # print(tests)
        for test in tests:
            print("\nValid test: %s" % test.name)
            print("Properties:")
            properties = test.properties()
            for prop in properties:
                property_identifier = ""
                if prop.scope == "test":
                    property_identifier = test.name + ": " + prop.name
                else:
                    for var_indices in test.properties_for_vars[prop]:
                        for var_index in var_indices:
                            property_identifier += f"variable {test.test_vars[var_index].name} "
                        property_identifier += ": %s" % prop.name
                print(property_identifier)
        """

        # Execute and store results from each valid test
        results = {}
        if len(tests) == 0:
            tests.append('bootstrap')  # Default to bootstrap

        for test in tests:
            test_result = execute_test(dataset, design, combined_data, test)
            results[test] = test_result

        if 'bootstrap' in tests:
            # import pdb; pdb.set_trace()
            pass

        # TODO: Calculate effect size for experiments!!
        # calculate_effect_size()
        # import pdb; pdb.set_trace()

        # TODO One-sided test?
        if expr.predictions:
            # import pdb; pdb.set_trace()
            # T-tests: may be a greater-than test when p/2 < alpha and t > 0, and a less-than test when p/2 < alpha and t < 0
            # --> may want to divide the p-value in the t-test (before returning)
            # Add something to the results indicating whether they are one-sided vs. two-sided tests?
            # TODO For f_test/... post hoc comparisons depending on predictions
            pass

        return ResultData(results)

    elif isinstance(expr, Mean):
        var = evaluate(dataset, expr.var, assumptions, design)
        assert isinstance(var, VarData)

        # bs.bootstrap(var.dataframe, stat_func=bs_stats.mean)
        raise Exception('Not implemented Mean')