Esempio n. 1
0
def rm_one_way_anova(dataset: Dataset, design, combined_data: CombinedData):
    data = dataset.data
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(ys) == 1)
    y = ys[0]
    between_subjs = []
    within_subjs = []
    for x in xs:
        if "between subjects" in design and design[
                "between subjects"] == x.metadata[name]:
            between_subjs.append(x.metadata[name])
        if "within subjects" in design and design[
                "within subjects"] == x.metadata[name]:
            within_subjs.append(x.metadata[name])

    # import pdb; pdb.set_trace()
    id = dataset.pid_col_name
    aovrm2way = AnovaRM(data,
                        depvar=y.metadata[name],
                        subject=id,
                        within=within_subjs)
    # aovrm2way = AnovaRM(data, depvar=y.metadata[name], subject=dataset.pid_col_name, within=within_subjs, between=between_subjs) # apparently not implemented in statsmodels
    # import pdb; pdb.set_trace()
    res2way = aovrm2way.fit()
Esempio n. 2
0
def bootstrap(dataset: Dataset, combined_data: CombinedData):
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # for now
        assert (len(ys) == 1)

        # Main effects
        for x in xs:
            cat = [k for k, v in x.metadata[categories].items()]
            for c in cat:
                cat_data = dataset.select(
                    y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
                stat = bs.bootstrap(cat_data.to_numpy(),
                                    stat_func=bs_stats.median)
                calculations[c] = stat
                # import pdb; pdb.set_trace()
                # store all the medians & confidence intervals
                # return all the medians & CIs
                # data.append(cat_data)

    return calculations
Esempio n. 3
0
def factorial_ANOVA(dataset: Dataset, combined_data: CombinedData):

    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)

    y = ys[0]

    formula = f"{y.metadata[name]} ~ "

    for i in range(len(xs)):
        x = xs[i]
        formula += f"C({x.metadata[name]})"

        if i < len(xs) - 1:
            formula += " + "

    # Add the interactions
    interactions = []
    for i in range(len(xs)):
        x_i = xs[i]
        inter = f"C({x_i.metadata[name]})"
        for j in range(len(xs)):
            if i != j:
                x_j = xs[j]
                inter += " * " + f"C({x_j.metadata[name]})"
                interactions.append(inter)

                if _is_interaction_unique(interactions, inter):
                    formula += " + " + inter

    ols_formula = ols(formula, data=dataset.data)
    model = ols_formula.fit()
    return sm.stats.anova_lm(model, type=2)
Esempio n. 4
0
def chi_square(dataset: Dataset, combined_data: CombinedData):
    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    if len(xs) == 1:
        if len(ys) == 1:
            x = xs[0]
            y = ys[0]

            # Get the count for each category
            x_cat = [k for k, v in x.metadata[categories].items()]
            y_cat = [k for k, v in y.metadata[categories].items()]

            contingency_table = []
            contingency_table_key = [
            ]  # labels for the order in which data is stored in data array (define above)

            for xc in x_cat:
                table_row = []
                table_row_key = []
                for yc in y_cat:
                    data = dataset.select(y.metadata[name],
                                          where=[
                                              f"{x.metadata[name]} == '{xc}'",
                                              f"{y.metadata[name]} == '{yc}'"
                                          ])
                    table_row.append(len(data))

                    x_y_key = str(
                        x.metadata[name]) + ':' + str(xc) + ' by ' + str(
                            y.metadata[name]) + ':' + str(yc)
                    table_row_key.append(x_y_key)

                assert (len(table_row_key) == len(table_row))
                assert (len(table_row) == len(y_cat))
                contingency_table.append(table_row)
                contingency_table_key.append(table_row_key)

        else:
            raise ValueError(
                f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
            )
    else:
        raise ValueError(
            f"Currently, chi square requires/only supports 1 explanatory variable, instead received: {len(xs)} -- {xs}"
        )

    # chi2, p, dof, ex = chi2_contingency(obs, correction=False)

    chi2, p, dof, ex = stats.chi2_contingency(contingency_table,
                                              correction=False)
    return ChisquareResult(chi2, p, dof, ex)
Esempio n. 5
0
def f_test(dataset: Dataset, combined_data: CombinedData):
    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)

    x = xs[0]
    y = ys[0]

    formula = ols(f"{y.metadata[name]} ~ C({x.metadata[name]})",
                  data=dataset.data)
    model = formula.fit()
    return sm.stats.anova_lm(model, type=2)
Esempio n. 6
0
def wilcoxon_signed_rank(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.wilcoxon(data[0], data[1])
Esempio n. 7
0
def paired_students_t(dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.ttest_rel(data[0], data[1])
Esempio n. 8
0
def friedman(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    return stats.friedmanchisquare(*data)
Esempio n. 9
0
def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha): 
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1: 
        if len(ys) == 1: 
            x = xs[0]
            y = ys[0]

            if x.is_categorical() and y.is_categorical(): 

                # Get the count for each category
                x_cat = [k for k,v in x.metadata[categories].items()]
                y_cat = [k for k,v in y.metadata[categories].items()]

                for xc in x_cat: 
                    for yc in y_cat: 
                        data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{xc}'", f"{y.metadata[name]} == '{yc}'"])                    

                        # Check that the count is at least five for each of the (x,y) group pairs
                        if (len(data) < 5): 
                            return False
                
                return True
            else: 
                return False
        else: 
            raise ValueError(f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}")    
    else: 
        x0 = xs[0]
        x1 = xs[1]
        
        if x0.is_categorical() and x1.is_categorical():
            # Get the count for each category
            x0_cat = [k for k,v in x0.metadata[categories].items()]
            x1_cat = [k for k,v in x1.metadata[categories].items()]

            for x0c in x0_cat: 
                for x1c in x1_cat: 
                    data = dataset.select(x1.metadata[name], where=[f"{x.metadata[name]} == '{xc}'", f"{x1.metadata[name]} == '{x1c}'"])                    

                    # Check that the count is at least five for each of the (x,x1) group pairs
                    if (len(data) < 5): 
                        return False
            return True
        else: 
            return False
Esempio n. 10
0
def pointbiserial(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.pointbiserialr(data[0], data[1])
Esempio n. 11
0
def kruskall_wallis(dataset: Dataset, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        if x.metadata[categories] is None:
            import pdb
            pdb.set_trace()
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    return stats.kruskal(*data)
Esempio n. 12
0
def fishers_exact(dataset: Dataset, combined_data: CombinedData):
    assert (len(combined_data.vars) == 2)

    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)

    x = xs[0]
    y = ys[0]

    # Get the count for each category
    x_cat = [k for k, v in x.metadata[categories].items()]
    y_cat = [k for k, v in y.metadata[categories].items()]

    contingency_table = []
    contingency_table_key = [
    ]  # labels for the order in which data is stored in data array (define above)

    for xc in x_cat:
        table_row = []
        table_row_key = []
        for yc in y_cat:
            data = dataset.select(y.metadata[name],
                                  where=[
                                      f"{x.metadata[name]} == '{xc}'",
                                      f"{y.metadata[name]} == '{yc}'"
                                  ])
            table_row.append(len(data))

            x_y_key = str(x.metadata[name]) + ':' + str(xc) + ' by ' + str(
                y.metadata[name]) + ':' + str(yc)
            table_row_key.append(x_y_key)

        assert (len(table_row_key) == len(table_row))
        assert (len(table_row) == len(y_cat))
        contingency_table.append(table_row)
        contingency_table_key.append(table_row_key)

    odds_ratio, p_value = stats.fisher_exact(contingency_table,
                                             alternative='two-sided')
    return FishersResult(odds_ratio, p_value)
Esempio n. 13
0
def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()
    cat_xs = []
    cont_ys = []
    grouped_data = []


    for x in xs: 
        if x.is_categorical(): 
            cat_xs.append(x)
    
    for y in ys: 
        if y.is_continuous(): 
            cont_ys.append(y)
    
    eq_var = (None, None)
    if cat_xs and cont_ys: 
        for y in ys:
            for x in xs: 
                cat = [k for k,v in x.metadata[categories].items()]
                for c in cat: 
                    data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)
                if isinstance(var_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                # elif isinstance(var_data, MultivariateData):
                #     var_data.properties[eq_variance + '::' + x.metadata[name] + ':' + y.metadata[name]] = compute_eq_variance(grouped_data)
                else: 
                    raise ValueError(f"var_data_data object is neither BivariateData nor MultivariateData: {type(var_data)}")

    if eq_var[0] is None and eq_var[1] is None:
        import pdb; pdb.set_trace()
        # raise Exception("did not compute variance, this is a bug")
        return False

    return (eq_var[1] > alpha)
Esempio n. 14
0
def has_one_x(dataset: Dataset, var_data: CombinedData, alpha): 
    xs = var_data.get_explanatory_variables()

    return len(xs) == 1