Example no. 1
def bootstrap(dataset: Dataset, combined_data: CombinedData):
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # for now
        assert (len(ys) == 1)

        # Main effects
        for x in xs:
            cat = [k for k, v in x.metadata[categories].items()]
            for c in cat:
                cat_data = dataset.select(
                    y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
                stat = bs.bootstrap(cat_data.to_numpy(),
                                    stat_func=bs_stats.median)
                calculations[c] = stat
                # store all the medians & confidence intervals
                # return all the medians & CIs
                # data.append(cat_data)

    return calculations
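
A minimal standalone sketch of the bootstrapped-median call that bootstrap() wraps for each category; the import aliases mirror the ones the snippet above appears to use (bs, bs_stats), and the sample values are made up.

import numpy as np
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats

group_a = np.array([4.1, 5.3, 5.0, 6.2, 4.8, 5.5])  # hypothetical measurements for one category
stat = bs.bootstrap(group_a, stat_func=bs_stats.median)  # bootstrapped median with a confidence interval
print(stat)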
Example no. 2
def rm_one_way_anova(dataset: Dataset, design, combined_data: CombinedData):
    data = dataset.data
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(ys) == 1)
    y = ys[0]
    between_subjs = []
    within_subjs = []
    for x in xs:
        if "between subjects" in design and design[
                "between subjects"] == x.metadata[name]:
            between_subjs.append(x.metadata[name])
        if "within subjects" in design and design[
                "within subjects"] == x.metadata[name]:
            within_subjs.append(x.metadata[name])

    id = dataset.pid_col_name
    aovrm2way = AnovaRM(data,
                        depvar=y.metadata[name],
                        subject=id,
                        within=within_subjs)
    # aovrm2way = AnovaRM(data, depvar=y.metadata[name], subject=dataset.pid_col_name, within=within_subjs, between=between_subjs) # apparently not implemented in statsmodels
    res2way = aovrm2way.fit()
    return res2way
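
A minimal standalone sketch of the statsmodels AnovaRM call used above, run on a tiny made-up long-format table (one row per subject/condition pair; the column names are hypothetical).

import pandas as pd
from statsmodels.stats.anova import AnovaRM

df = pd.DataFrame({
    "pid":       [1, 1, 2, 2, 3, 3, 4, 4],          # subject identifier
    "condition": ["a", "b"] * 4,                     # within-subjects factor
    "score":     [5.1, 6.0, 4.8, 5.9, 5.5, 6.3, 5.0, 6.1],
})
res = AnovaRM(df, depvar="score", subject="pid", within=["condition"]).fit()
print(res)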
Example no. 3
def add_paired_property(
        dataset,
        combined_data: CombinedData,
        study_type: str,
        design: Dict[str, str] = None):  # check same sizes are identical
    global paired

    x = None
    y = None
    combined_data.properties[paired] = False
    if isinstance(combined_data, BivariateData):
        if study_type == experiment_identifier:
            # Just need one variable to be Categorical and another to be Continuous (regardless of role)
            x = combined_data.get_vars(iv_identifier)
            y = combined_data.get_vars(dv_identifier)

        else:  # study_type == observational_identifier
            x = combined_data.get_vars(contributor_identifier)
            y = combined_data.get_vars(outcome_identifier)

        if x and y:
            assert (len(x) == len(y) == 1)
            x = x[0]
            y = y[0]

            if x.is_categorical() and y.is_continuous():
                if within_subj in design and design[within_subj] == x.metadata[
                        name]:
                    combined_data.properties[paired] = True
Example no. 4
def factorial_ANOVA(dataset: Dataset, combined_data: CombinedData):

    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)

    y = ys[0]

    formula = f"{y.metadata[name]} ~ "

    for i in range(len(xs)):
        x = xs[i]
        formula += f"C({x.metadata[name]})"

        if i < len(xs) - 1:
            formula += " + "

    # Add the interactions
    interactions = []
    for i in range(len(xs)):
        x_i = xs[i]
        inter = f"C({x_i.metadata[name]})"
        for j in range(len(xs)):
            if i != j:
                x_j = xs[j]
                inter += " * " + f"C({x_j.metadata[name]})"
                interactions.append(inter)

                if _is_interaction_unique(interactions, inter):
                    formula += " + " + inter

    ols_formula = ols(formula, data=dataset.data)
    model = ols_formula.fit()
    return sm.stats.anova_lm(model, typ=2)
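
A minimal standalone sketch of the formula-based factorial ANOVA that factorial_ANOVA() assembles, with a made-up DataFrame and hypothetical column names.

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

df = pd.DataFrame({
    "score": [5.1, 6.0, 4.8, 5.9, 7.2, 6.8, 5.5, 6.3],
    "drug":  ["a", "a", "b", "b", "a", "a", "b", "b"],
    "dose":  ["low", "high"] * 4,
})
# Main effects plus the interaction, mirroring the formula the function builds.
model = ols("score ~ C(drug) + C(dose) + C(drug) * C(dose)", data=df).fit()
print(sm.stats.anova_lm(model, typ=2))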
Example no. 5
def chi_square(dataset: Dataset, combined_data: CombinedData):
    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    if len(xs) == 1:
        if len(ys) == 1:
            x = xs[0]
            y = ys[0]

            # Get the count for each category
            x_cat = [k for k, v in x.metadata[categories].items()]
            y_cat = [k for k, v in y.metadata[categories].items()]

            contingency_table = []
            contingency_table_key = []  # labels for the order in which counts are stored in contingency_table

            for xc in x_cat:
                table_row = []
                table_row_key = []
                for yc in y_cat:
                    data = dataset.select(y.metadata[name],
                                          where=[
                                              f"{x.metadata[name]} == '{xc}'",
                                              f"{y.metadata[name]} == '{yc}'"
                                          ])
                    table_row.append(len(data))

                    x_y_key = str(
                        x.metadata[name]) + ':' + str(xc) + ' by ' + str(
                            y.metadata[name]) + ':' + str(yc)
                    table_row_key.append(x_y_key)

                assert (len(table_row_key) == len(table_row))
                assert (len(table_row) == len(y_cat))
                contingency_table.append(table_row)
                contingency_table_key.append(table_row_key)

        else:
            raise ValueError(
                f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
            )
    else:
        raise ValueError(
            f"Currently, chi square requires/only supports 1 explanatory variable, instead received: {len(xs)} -- {xs}"
        )

    chi2, p, dof, ex = stats.chi2_contingency(contingency_table,
                                              correction=False)
    return ChisquareResult(chi2, p, dof, ex)
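
A minimal standalone sketch of the scipy call chi_square() feeds its contingency table into, using a hard-coded 2x2 table of made-up counts.

from scipy import stats

contingency_table = [[12, 7],
                     [8, 15]]  # rows: categories of x, columns: categories of y
chi2, p, dof, expected = stats.chi2_contingency(contingency_table, correction=False)
print(chi2, p, dof)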
Example no. 6
def wilcoxon_signed_rank(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.wilcoxon(data[0], data[1])
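
A minimal standalone sketch of the Wilcoxon signed-rank call above, applied to two equally sized, made-up paired samples (one per category of x).

from scipy import stats

before = [5.1, 6.0, 4.8, 5.9, 5.5, 6.3]  # hypothetical paired measurements
after = [5.6, 6.4, 5.0, 6.2, 5.4, 6.8]
print(stats.wilcoxon(before, after))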
Example no. 7
def paired_students_t(dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.ttest_rel(data[0], data[1])
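
A minimal standalone sketch of the paired t-test call above; the two lists stand in for the per-category selections and are made up for illustration.

from scipy import stats

cond_a = [5.1, 6.0, 4.8, 5.9, 5.5, 6.3]  # hypothetical paired measurements
cond_b = [5.6, 6.4, 5.0, 6.2, 5.4, 6.8]
print(stats.ttest_rel(cond_a, cond_b))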
Example no. 8
def f_test(dataset: Dataset, combined_data: CombinedData):
    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)

    x = xs[0]
    y = ys[0]

    formula = ols(f"{y.metadata[name]} ~ C({x.metadata[name]})",
                  data=dataset.data)
    model = formula.fit()
    return sm.stats.anova_lm(model, typ=2)
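
A minimal standalone sketch of the one-way ANOVA that f_test() fits, with a made-up DataFrame and hypothetical column names.

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

df = pd.DataFrame({
    "score": [5.1, 6.0, 4.8, 7.2, 6.8, 7.5],
    "group": ["a", "a", "a", "b", "b", "b"],
})
model = ols("score ~ C(group)", data=df).fit()
print(sm.stats.anova_lm(model, typ=2))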
Example no. 9
def friedman(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    return stats.friedmanchisquare(*data)
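
A minimal standalone sketch of the Friedman test call above, applied to three made-up related samples (one per category).

from scipy import stats

cond_a = [5.1, 6.0, 4.8, 5.9, 5.5]  # hypothetical repeated measurements
cond_b = [5.6, 6.4, 5.0, 6.2, 5.4]
cond_c = [6.1, 6.9, 5.5, 6.6, 6.0]
print(stats.friedmanchisquare(cond_a, cond_b, cond_c))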
Example no. 10
def add_eq_variance_property(dataset, combined_data: CombinedData,
                             study_type: str):
    xs = None
    ys = None
    cat_xs = []
    cont_ys = []
    grouped_data = []

    if study_type == experiment_identifier:
        # Just need one variable to be Categorical and another to be Continuous (regardless of role) -- both could be variable_identifier types
        xs = combined_data.get_vars(iv_identifier)
        ys = combined_data.get_vars(dv_identifier)

    else:  # study_type == observational_identifier
        xs = combined_data.get_vars(contributor_identifier)
        ys = combined_data.get_vars(outcome_identifier)

    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    combined_data.properties[eq_variance] = None

    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(
                        y.metadata[name],
                        where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)
                if isinstance(combined_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                    combined_data.properties[eq_variance] = eq_var
                elif isinstance(combined_data, MultivariateData):
                    combined_data.properties[
                        eq_variance + '::' + x.metadata[name] + ':' +
                        y.metadata[name]] = compute_eq_variance(grouped_data)
                else:
                    raise ValueError(
                        f"combined_data_data object is neither BivariateData nor MultivariateData: {type(combined_data)}"
                    )
Example no. 11
def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha): 
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1: 
        if len(ys) == 1: 
            x = xs[0]
            y = ys[0]

            if x.is_categorical() and y.is_categorical(): 

                # Get the count for each category
                x_cat = [k for k,v in x.metadata[categories].items()]
                y_cat = [k for k,v in y.metadata[categories].items()]

                for xc in x_cat: 
                    for yc in y_cat: 
                        data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{xc}'", f"{y.metadata[name]} == '{yc}'"])                    

                        # Check that the count is at least five for each of the (x,y) group pairs
                        if (len(data) < 5): 
                            return False
                
                return True
            else: 
                return False
        else: 
            raise ValueError(f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}")    
    else: 
        x0 = xs[0]
        x1 = xs[1]
        
        if x0.is_categorical() and x1.is_categorical():
            # Get the count for each category
            x0_cat = [k for k,v in x0.metadata[categories].items()]
            x1_cat = [k for k,v in x1.metadata[categories].items()]

            for x0c in x0_cat: 
                for x1c in x1_cat: 
                    data = dataset.select(x1.metadata[name], where=[f"{x0.metadata[name]} == '{x0c}'", f"{x1.metadata[name]} == '{x1c}'"])

                    # Check that the count is at least five for each of the (x0, x1) group pairs
                    if (len(data) < 5): 
                        return False
            return True
        else: 
            return False
Example no. 12
def pointbiserial(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.pointbiserialr(data[0], data[1])
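
A minimal standalone sketch of scipy's point-biserial correlation on made-up data; pointbiserialr expects two equally long arrays, one dichotomous (0/1) and one continuous.

from scipy import stats

binary = [0, 0, 0, 1, 1, 1, 0, 1]                 # hypothetical dichotomous variable
scores = [5.1, 4.8, 5.5, 6.9, 7.2, 6.5, 5.0, 7.0]  # hypothetical continuous variable
print(stats.pointbiserialr(binary, scores))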
Example no. 13
def add_categories_normal(dataset,
                          combined_data: CombinedData,
                          study_type: str,
                          design: Dict[str, str] = None):
    global cat_distribution

    xs = None
    ys = None
    cat_xs = []
    cont_ys = []
    grouped_data = dict()

    if study_type == experiment_identifier:
        # Just need one variable to be Categorical and another to be Continuous (regardless of role) -- both could be variable_identifier types
        xs = combined_data.get_vars(iv_identifier)
        ys = combined_data.get_vars(dv_identifier)

    else:  # study_type == observational_identifier
        xs = combined_data.get_vars(contributor_identifier)
        ys = combined_data.get_vars(outcome_identifier)

    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    combined_data.properties[cat_distribution] = None

    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(
                        y.metadata[name],
                        where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data_name = str(x.metadata[name] + ':' + c)
                    grouped_data[grouped_data_name] = compute_distribution(
                        data)
                combined_data.properties[cat_distribution] = dict()
                combined_data.properties[cat_distribution][
                    y.metadata[name] + '::' + x.metadata[name]] = grouped_data
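
compute_distribution() is not shown in this listing; as a hedged stand-in, the sketch below summarizes one made-up group with a Shapiro-Wilk normality test, which is one plausible shape for that helper.

from scipy import stats

group = [5.1, 6.0, 4.8, 5.9, 5.5, 6.3, 5.7, 5.2, 6.1, 5.8]  # hypothetical group data
w_stat, p_value = stats.shapiro(group)  # Shapiro-Wilk test of normality
print({"W": w_stat, "p": p_value, "mean": sum(group) / len(group)})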
Example no. 14
def kruskall_wallis(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        if x.metadata[categories] is None:
            raise ValueError(
                f"Explanatory variable {x.metadata[name]} has no categories.")
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    return stats.kruskal(*data)
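
A minimal standalone sketch of the Kruskal-Wallis call above, applied to three made-up independent groups.

from scipy import stats

group_a = [5.1, 6.0, 4.8, 5.9]  # hypothetical independent groups
group_b = [6.5, 7.2, 6.8, 7.0]
group_c = [4.2, 4.9, 5.0, 4.5]
print(stats.kruskal(group_a, group_b, group_c))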
Example no. 15
def fishers_exact(dataset: Dataset, combined_data: CombinedData):
    assert (len(combined_data.vars) == 2)

    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)

    x = xs[0]
    y = ys[0]

    # Get the count for each category
    x_cat = [k for k, v in x.metadata[categories].items()]
    y_cat = [k for k, v in y.metadata[categories].items()]

    contingency_table = []
    contingency_table_key = []  # labels for the order in which counts are stored in contingency_table

    for xc in x_cat:
        table_row = []
        table_row_key = []
        for yc in y_cat:
            data = dataset.select(y.metadata[name],
                                  where=[
                                      f"{x.metadata[name]} == '{xc}'",
                                      f"{y.metadata[name]} == '{yc}'"
                                  ])
            table_row.append(len(data))

            x_y_key = str(x.metadata[name]) + ':' + str(xc) + ' by ' + str(
                y.metadata[name]) + ':' + str(yc)
            table_row_key.append(x_y_key)

        assert (len(table_row_key) == len(table_row))
        assert (len(table_row) == len(y_cat))
        contingency_table.append(table_row)
        contingency_table_key.append(table_row_key)

    odds_ratio, p_value = stats.fisher_exact(contingency_table,
                                             alternative='two-sided')
    return FishersResult(odds_ratio, p_value)
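
A minimal standalone sketch of the Fisher's exact test call above on a hard-coded 2x2 table of made-up counts (scipy's fisher_exact only accepts 2x2 tables).

from scipy import stats

table = [[8, 2],
         [1, 5]]  # rows: categories of x, columns: categories of y
odds_ratio, p_value = stats.fisher_exact(table, alternative='two-sided')
print(odds_ratio, p_value)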
Example no. 16
def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()
    cat_xs = []
    cont_ys = []
    grouped_data = []


    for x in xs: 
        if x.is_categorical(): 
            cat_xs.append(x)
    
    for y in ys: 
        if y.is_continuous(): 
            cont_ys.append(y)
    
    eq_var = (None, None)
    if cat_xs and cont_ys: 
        for y in ys:
            for x in xs: 
                cat = [k for k,v in x.metadata[categories].items()]
                for c in cat: 
                    data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)
                if isinstance(var_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                # elif isinstance(var_data, MultivariateData):
                #     var_data.properties[eq_variance + '::' + x.metadata[name] + ':' + y.metadata[name]] = compute_eq_variance(grouped_data)
                else: 
                    raise ValueError(f"var_data_data object is neither BivariateData nor MultivariateData: {type(var_data)}")

    if eq_var[0] is None and eq_var[1] is None:
        # Equal variance was never computed (no categorical x / continuous y pairing).
        return False

    return (eq_var[1] > alpha)
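
compute_eq_variance() is not shown in this listing, but it evidently returns a (statistic, p-value) pair; Levene's test is a plausible stand-in, sketched here on two made-up groups.

from scipy import stats

group_a = [5.1, 6.0, 4.8, 5.9, 5.5]  # hypothetical groups
group_b = [6.5, 9.2, 4.1, 7.8, 5.0]
stat, p = stats.levene(group_a, group_b)
print(p > 0.05)  # True means we fail to reject equal variances at alpha = 0.05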
Example no. 17
def has_one_y(dataset: Dataset, var_data: CombinedData, alpha): 
    ys = var_data.get_explained_variables()

    return len(ys) == 1
Example no. 18
def has_one_x(dataset: Dataset, var_data: CombinedData, alpha): 
    xs = var_data.get_explanatory_variables()

    return len(xs) == 1
Example no. 19
def synthesize_tests(dataset: Dataset, assumptions: Dict[str,str], combined_data: CombinedData):    
    construct_all_tests(combined_data)

    global name
    stat_var_map = {}

    # Reorder variables so that y var is at the end
    combined_data._update_vars() 

    # Compute unique statistical variable names from the combined data.
    combined_data_vars = []
    for v in combined_data.vars:
        var = StatVar(v.metadata[name])
        stat_var_map[v.metadata[name]] = var 
        combined_data_vars.append(var)

    # Assume properties are True based on user assumptions
    solver = z3.Solver()
    # s = Tactic('qflia').solver()
    assumed_props = assume_properties(stat_var_map, assumptions, solver)

    # Update the arity of test-level properties
    for prop in test_props: 
        prop._update(len(combined_data.vars))

    # print(combined_data)
    # Apply all tests to the variables we are considering now in combined_data
    for test in all_tests(): 
        test.apply(*combined_data_vars)

    solver.push() # Create backtracking point
    model = None # Store model

    # For each test, add it to the solver as a constraint. 
    # Add the tests and their properties
    for test in all_tests():
        log(f"\nCurrently considering {test.name}")
        solver.add(test.__z3__ == z3.And(*test.query()))
        solver.add(test.__z3__ == z3.BoolVal(True))

        # Check the model 
        result = solver.check()
        if result == z3.unsat:
            log("Test is unsat.\n")
            # print("no more solutions")
            # print(solver.num_scopes())
            solver.pop() 
            # model = solver.model() # may need to do a check before call model
        elif result == z3.unknown:
            print("failed to solve")
            try:
                # print(solver.model())
                pass
            except z3.Z3Exception:
                return
        else:
            model = solver.model()
            test_invalid = False
            # Does the test apply?
            # Would this ever be false??
            if model and z3.is_true(model.evaluate(test.__z3__)):
                # Verify the properties for that test
                for prop in test._properties:
                    log(f"Testing assumption: {prop._name}.")
                    need_to_verify = True
                    # If the prop was assumed by the user, skip verification.
                    for ap in assumed_props:
                        if prop == ap: 
                            log(f"Property was a user assumption. ")
                            prop.property_test_results = "Assumed true."
                            need_to_verify = False
                    if need_to_verify: 
                        # Does this property need to hold for the test to be valid?
                        # If so, verify that the property does hold
                        if model and z3.is_true(model.evaluate(prop.__z3__)):
                            val = verify_prop(dataset, combined_data, prop)
                            if val: 
                                log(f"Property holds.")
                            if not val: 
                                if not test_invalid: # The property does not check
                                    log(f"Property FAILS")
                                    solver.pop() # remove the last test
                                    test_invalid = True
                                    model = None
                                else: # test is already invalid. Going here just for completeness of logging
                                    log(f"EVER GET HERE?")
                            solver.add(prop.__z3__ == z3.BoolVal(val))
        solver.push() # Push latest state as backtracking point
        
    solver.check()
    model = solver.model() # final model
    tests_to_conduct = []
    # Could add all the test props first 
    # Then add all the tests 
    for test in all_tests():
        if model and z3.is_true(model.evaluate(test.__z3__)):
            tests_to_conduct.append(test.name)
        elif not model: # No test applies
            pass

    reset_all_tests()
    return tests_to_conduct
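
A minimal standalone sketch of the z3 push/check/pop pattern synthesize_tests() relies on: assert a candidate test as true, keep it if the solver stays satisfiable, and backtrack otherwise. The boolean variables here are made up for illustration.

import z3

solver = z3.Solver()
candidate = z3.Bool("candidate_test")
prereq = z3.Bool("prerequisite_property")

solver.add(candidate == prereq)            # the test holds iff its property holds
solver.push()                              # backtracking point
solver.add(candidate == z3.BoolVal(True))  # assert the candidate test applies
solver.add(prereq == z3.BoolVal(False))    # suppose property verification failed
if solver.check() == z3.unsat:
    solver.pop()                           # discard the failed candidate
print(solver.check())                      # sat again after backtracking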
Example no. 20
                              properties=categorical_properties,
                              role=iv_identifier)
nominal_dependent = VarData(metadata=nominal_metadata,
                            properties=categorical_properties,
                            role=dv_identifier)
ordinal_dependent = VarData(metadata=ordinal_metadata,
                            properties=categorical_properties,
                            role=dv_identifier)
ordinal_var = VarData(metadata=ordinal_metadata,
                      properties=categorical_properties,
                      role=null_identifier)
nominal_var = VarData(metadata=nominal_metadata,
                      properties=categorical_properties,
                      role=null_identifier)

cont_not_specified = CombinedData(
    vars=[normal_continuous_var, normal_continuous_var])
cont_diff_sample_size = CombinedData(
    vars=[normal_continuous_var, normal_continuous_large_sample])
cont_iv_dv = CombinedData(
    vars=[normal_continuous_dependent, normal_continuous_independent])
nominal_iv_cont_dv = CombinedData(
    vars=[normal_continuous_dependent, nominal_independent])
ordinal_iv_cont_dv = CombinedData(
    vars=[normal_continuous_dependent, ordinal_independent])
cont_iv_nominal_dv = CombinedData(
    vars=[nominal_dependent, normal_continuous_independent])
cont_iv_ordinal_dv = CombinedData(
    vars=[ordinal_dependent, normal_continuous_independent])
ordinal_not_specified = CombinedData(vars=[ordinal_var, ordinal_var])
nominal_not_specified = CombinedData(vars=[nominal_var, nominal_var])
ordinal_nominal_not_specified = CombinedData(vars=[ordinal_var, nominal_var])