Esempio n. 1
0
def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha):
    """Check that every cell of a two-way categorical table has >= 5 rows.

    With exactly one explanatory variable, that variable is crossed with the
    single explained variable. Otherwise the first two explanatory variables
    are crossed with each other.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        var_data: Supplies the explanatory and explained variables.
        alpha: Unused here; kept so the signature stays unchanged.

    Returns:
        True when both variables are categorical and every pair of categories
        selects at least five rows; False otherwise.

    Raises:
        ValueError: If there is one explanatory variable but the number of
            explained variables is not exactly one.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) != 1:
            raise ValueError(
                f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
            )
        first, second = xs[0], ys[0]
    else:
        # Cross the first two explanatory variables. Both filters must be
        # built from these two variables; names from the single-x path are
        # not bound here.
        first, second = xs[0], xs[1]

    if not (first.is_categorical() and second.is_categorical()):
        return False

    return _every_cell_has_min_count(dataset, first, second)


def _every_cell_has_min_count(dataset, first, second, min_count=5):
    """Return True if each (first, second) category pair selects >= min_count rows."""
    first_levels = list(first.metadata[categories].keys())
    second_levels = list(second.metadata[categories].keys())

    for first_level in first_levels:
        for second_level in second_levels:
            cell = dataset.select(
                second.metadata[name],
                where=[
                    f"{first.metadata[name]} == '{first_level}'",
                    f"{second.metadata[name]} == '{second_level}'",
                ])

            # Stop at the first under-populated cell.
            if len(cell) < min_count:
                return False

    return True
def fishers_exact(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run ``stats.fisher_exact`` on the x-category by y-category count table.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Optional sequence; its first element (or the first element
            of that element, if it is a list) is stored on the result.
        combined_data: Must hold exactly two variables: one explanatory and
            one explained.

    Returns:
        A ``TestResult`` carrying the odds ratio as the test statistic and the
        two-sided p-value.
    """
    assert len(combined_data.vars) == 2

    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert len(explanatory) == 1
    assert len(explained) == 1

    x = explanatory[0]
    y = explained[0]

    x_levels = list(x.metadata[categories].keys())
    y_levels = list(y.metadata[categories].keys())

    counts = []  # counts[i][j]: rows with x == x_levels[i] and y == y_levels[j]
    labels = []  # same layout as counts, naming each cell
    for x_level in x_levels:
        row_counts = []
        row_labels = []
        for y_level in y_levels:
            matches = dataset.select(
                y.metadata[name],
                where=[
                    f"{x.metadata[name]} == '{x_level}'",
                    f"{y.metadata[name]} == '{y_level}'",
                ])
            row_counts.append(len(matches))
            row_labels.append(' by '.join((
                str(x.metadata[name]) + ':' + str(x_level),
                str(y.metadata[name]) + ':' + str(y_level),
            )))

        assert len(row_labels) == len(row_counts)
        assert len(row_counts) == len(y_levels)
        counts.append(row_counts)
        labels.append(row_labels)

    if not predictions:
        prediction = None
    elif isinstance(predictions[0], list):
        prediction = predictions[0][0]
    else:
        prediction = predictions[0]

    odds_ratio, p_val = stats.fisher_exact(counts, alternative='two-sided')

    return TestResult(
        name=fisher_exact_name,
        test_statistic=odds_ratio,
        p_value=p_val,
        prediction=prediction,
        dof=None,
        alpha=combined_data.alpha,
        x=x,
        y=y)
Esempio n. 3
0
def rm_one_way_anova(dataset: Dataset, predictions, design,
                     combined_data: CombinedData):
    """Fit a repeated-measures ANOVA and report the row for one factor.

    Args:
        dataset: Provides the data table (``dataset.data``) and the subject
            id column name (``dataset.pid_col_name``).
        predictions: Optional sequence; its first element (or the first element
            of that element, if it is a list) is stored on the result.
        design: Mapping that may contain a "within subjects" entry naming the
            within-subject factor.
        combined_data: Supplies the explanatory variables and exactly one
            explained variable.

    Returns:
        A ``TestResult`` with the F value, p-value and (num, den) degrees of
        freedom taken from the fitted ANOVA table.

    Raises:
        ValueError: If there are no explanatory variables, or the ANOVA table
            has no row for the reported variable.
    """
    data = dataset.data
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(ys) == 1)
    y = ys[0]

    if not xs:
        raise ValueError(
            "rm_one_way_anova requires at least one explanatory variable")

    # Only within-subject factors go into the model: the original author
    # noted that AnovaRM's `between` argument is apparently not implemented
    # in statsmodels.
    within_subjs = [
        x.metadata[name] for x in xs
        if "within subjects" in design
        and design["within subjects"] == x.metadata[name]
    ]

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    key = dataset.pid_col_name
    aovrm2way = AnovaRM(data,
                        depvar=y.metadata[name],
                        subject=key,
                        within=within_subjs,
                        aggregate_func='mean')
    res2way = aovrm2way.fit()
    result_df = res2way.anova_table

    # The reported variable is the last explanatory variable.
    # NOTE(review): if that variable is not the within-subject factor the
    # table has no row for it -- confirm this is the intended variable.
    x = xs[-1]
    col_name = x.metadata[name]
    if col_name not in result_df.index:
        raise ValueError(
            f"ANOVA table has no row for '{col_name}'; "
            f"available rows: {list(result_df.index)}")

    row_data = result_df.loc[col_name]
    test_statistic = row_data['F Value']
    p_val = row_data['Pr > F']
    dof = (row_data['Num DF'], row_data['Den DF'])

    test_result = TestResult(name=rm_one_way_anova_name,
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=result_df,
                             x=x,
                             y=y)

    return test_result
Esempio n. 4
0
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Compute the point-biserial correlation between a two-level x and y.

    The y values of the first two categories of x are pooled and paired with
    a 0/1 indicator (1 for the first category, 0 for the second), which is the
    input form ``stats.pointbiserialr`` expects. This works for equal and
    unequal group sizes alike, so no hand-rolled fallback is needed.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Optional sequence; its first element (or the first element
            of that element, if it is a list) is stored on the result.
        combined_data: Must supply exactly one explanatory and one explained
            variable.

    Returns:
        A ``TestResult`` holding the correlation and its p-value.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    data = [
        dataset.select(y.metadata[name],
                       where=[f"{x.metadata[name]} == '{c}'"])
        for c in x.metadata[categories].keys()
    ]

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    group_0, group_1 = data[0], data[1]

    # Group membership is the binary variable; the pooled y values are the
    # continuous one. A positive correlation means the first category has the
    # larger mean.
    # assumes select() returns a 1-D sequence of y values -- TODO confirm
    indicator = np.concatenate(
        [np.ones(len(group_0)), np.zeros(len(group_1))])
    values = np.concatenate([group_0, group_1])
    corr, p_val = stats.pointbiserialr(indicator, values)

    dof = None
    test_result = TestResult(name=POINTBISERIAL_NAME,
                             test_statistic=corr,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result
Esempio n. 5
0
def f_test(dataset: Dataset, predictions, combined_data: CombinedData):
    """Fit ``y ~ C(x)`` by OLS and return its Type II ANOVA table.

    Args:
        dataset: Provides the data table (``dataset.data``) the model is fit on.
        predictions: Unused here; kept so the signature stays unchanged.
        combined_data: Must supply exactly one explanatory and one explained
            variable.

    Returns:
        The table produced by ``sm.stats.anova_lm`` for the fitted model.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert(len(xs) == 1)
    assert(len(ys) == 1)

    x = xs[0]
    y = ys[0]

    # x is wrapped in C(...) so the formula treats it as categorical.
    formula = ols(f"{y.metadata[name]} ~ C({x.metadata[name]})", data=dataset.data)
    model = formula.fit()
    # The keyword is `typ`; an unrecognised `type=` is ignored and leaves the
    # default Type I table.
    return sm.stats.anova_lm(model, typ=2)
Esempio n. 6
0
def wilcoxon_signed_rank(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run ``stats.wilcoxon`` on the y values of x's first two categories.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Unused here.
        combined_data: Supplies the explanatory and explained variables; only
            the first of each is used.

    Returns:
        Whatever ``stats.wilcoxon`` returns for the two samples.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    factor = explanatory[0]
    outcome = explained[0]

    # One sample of outcome values per category of the factor.
    samples = [
        dataset.select(outcome.metadata[name],
                       where=[f"{factor.metadata[name]} == '{level}'"])
        for level in list(factor.metadata[categories].keys())
    ]

    first_sample, second_sample = samples[0], samples[1]
    return stats.wilcoxon(first_sample, second_sample)
Esempio n. 7
0
def friedman(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run ``stats.friedmanchisquare`` over one y sample per x category.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Unused here.
        combined_data: Supplies the explanatory variables and exactly one
            explained variable.

    Returns:
        Whatever ``stats.friedmanchisquare`` returns for the samples.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert len(explained) == 1
    outcome = explained[0]

    samples = []
    for factor in explanatory:
        levels = list(factor.metadata[categories].keys())
        # Samples are added in category order, factor by factor.
        samples.extend(
            dataset.select(outcome.metadata[name],
                           where=[f"{factor.metadata[name]} == '{level}'"])
            for level in levels)

    return stats.friedmanchisquare(*samples)
Esempio n. 8
0
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Compute the point-biserial correlation between a two-level x and y.

    The y values of the first two categories of x are pooled and paired with
    a 0/1 indicator (1 for the first category, 0 for the second), which is the
    input form ``stats.pointbiserialr`` expects.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Unused here.
        combined_data: Must supply exactly one explanatory and one explained
            variable.

    Returns:
        Whatever ``stats.pointbiserialr`` returns for the indicator and the
        pooled values.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert(len(xs) == 1)
    assert(len(ys) == 1)
    x = xs[0]
    y = ys[0]

    data = [
        dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
        for c in x.metadata[categories].keys()
    ]

    group_0, group_1 = data[0], data[1]

    # Group membership is the binary variable; the pooled y values are the
    # continuous one. Passing the two groups' values directly would correlate
    # them with each other instead.
    # assumes select() returns a 1-D sequence of y values -- TODO confirm
    indicator = [1] * len(group_0) + [0] * len(group_1)
    values = list(group_0) + list(group_1)

    return stats.pointbiserialr(indicator, values)
Esempio n. 9
0
def kruskall_wallis(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run ``stats.kruskal`` over one y sample per x category.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Unused here.
        combined_data: Supplies the explanatory variables and exactly one
            explained variable.

    Returns:
        Whatever ``stats.kruskal`` returns for the samples.

    Raises:
        ValueError: If an explanatory variable's categories entry is None.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert len(explained) == 1
    outcome = explained[0]

    samples = []
    for factor in explanatory:
        levels = factor.metadata[categories]
        if levels is None:
            raise ValueError('')
        # Samples are added in category order, factor by factor.
        samples.extend(
            dataset.select(outcome.metadata[name],
                           where=[f"{factor.metadata[name]} == '{level}'"])
            for level in list(levels.keys()))

    return stats.kruskal(*samples)
Esempio n. 10
0
def f_test(dataset: Dataset, predictions, combined_data: CombinedData):
    """Fit ``y ~ C(x)`` by OLS and report the F test for x.

    Args:
        dataset: Provides the data table (``dataset.data``) the model is fit on.
        predictions: Optional sequence; its first element (or the first element
            of that element, if it is a list) is stored on the result.
        combined_data: Must supply exactly one explanatory and one explained
            variable.

    Returns:
        A ``TestResult`` with the F statistic, p-value and degrees of freedom
        read from the ``C(x)`` row of the Type II ANOVA table, plus the table.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert(len(xs) == 1)
    assert(len(ys) == 1)

    x = xs[0]
    y = ys[0]

    # x is wrapped in C(...) so the formula treats it as categorical.
    formula = ols(f"{y.metadata[name]} ~ C({x.metadata[name]})", data=dataset.data)
    model = formula.fit()

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    # The keyword is `typ`; an unrecognised `type=` is ignored and leaves the
    # default Type I table.
    result_df = sm.stats.anova_lm(model, typ=2)

    # Look the row up directly so a missing row fails with a KeyError naming
    # it, rather than leaving the result variables unbound.
    col_name = "C(" + x.metadata[name] + ")"
    row_data = result_df.loc[col_name]
    test_statistic = row_data['F']
    p_val = row_data['PR(>F)']
    dof = row_data['df']

    test_result = TestResult(
                        name = f_test_name,
                        test_statistic = test_statistic,
                        p_value = p_val,
                        prediction = prediction,
                        dof = dof,
                        alpha = combined_data.alpha,
                        table = result_df,
                        x=x,
                        y=y)

    return test_result
Esempio n. 11
0
def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    """Report whether the y groups defined by x pass an equal-variance check.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        var_data: Supplies the explanatory and explained variables. Only
            ``BivariateData`` is supported.
        alpha: Threshold the second element of the ``compute_eq_variance``
            result is compared against.

    Returns:
        True when that second element is greater than ``alpha``. False when it
        is not, or when no check was computed (no categorical x or no
        continuous y).

    Raises:
        ValueError: If a check is attempted and ``var_data`` is not
            ``BivariateData``.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    cat_xs = [x for x in xs if x.is_categorical()]
    cont_ys = [y for y in ys if y.is_continuous()]
    grouped_data = []

    eq_var = (None, None)
    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                for c in x.metadata[categories].keys():
                    data = dataset.select(
                        y.metadata[name],
                        where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)
                if isinstance(var_data, BivariateData):
                    eq_var = compute_eq_variance(grouped_data)
                else:
                    # Multivariate data is not handled here.
                    raise ValueError(
                        f"var_data_data object is neither BivariateData nor MultivariateData: {type(var_data)}"
                    )

    # Nothing was computed: report "not equal" without stopping the process
    # in an interactive debugger.
    if eq_var[0] is None and eq_var[1] is None:
        return False

    return (eq_var[1] > alpha)
Esempio n. 12
0
def chi_square(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run ``stats.chi2_contingency`` on the x-category by y-category counts.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Unused here.
        combined_data: Must supply exactly one explanatory and one explained
            variable.

    Returns:
        A ``ChisquareResult`` built from the four values returned by
        ``stats.chi2_contingency`` (called with ``correction=False``).

    Raises:
        ValueError: If there is not exactly one explanatory variable, or not
            exactly one explained variable.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()

    # Guard clauses: the explanatory count is checked first.
    if len(explanatory) != 1:
        xs = explanatory
        raise ValueError(f"Currently, chi square requires/only supports 1 explanatory variable, instead received: {len(xs)} -- {xs}")
    if len(explained) != 1:
        ys = explained
        raise ValueError(f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}")

    x = explanatory[0]
    y = explained[0]

    x_levels = list(x.metadata[categories].keys())
    y_levels = list(y.metadata[categories].keys())

    counts = []  # counts[i][j]: rows with x == x_levels[i] and y == y_levels[j]
    labels = []  # same layout as counts, naming each cell
    for x_level in x_levels:
        row_counts = []
        row_labels = []
        for y_level in y_levels:
            matches = dataset.select(
                y.metadata[name],
                where=[
                    f"{x.metadata[name]} == '{x_level}'",
                    f"{y.metadata[name]} == '{y_level}'",
                ])
            row_counts.append(len(matches))
            row_labels.append(' by '.join((
                str(x.metadata[name]) + ':' + str(x_level),
                str(y.metadata[name]) + ':' + str(y_level),
            )))

        assert len(row_labels) == len(row_counts)
        assert len(row_counts) == len(y_levels)
        counts.append(row_counts)
        labels.append(row_labels)

    chi2, p, dof, ex = stats.chi2_contingency(counts, correction=False)
    return ChisquareResult(chi2, p, dof, ex)
Esempio n. 13
0
def rm_one_way_anova(dataset: Dataset, design, combined_data: CombinedData):
    """Fit a repeated-measures ANOVA on the within-subject factors.

    Args:
        dataset: Provides the data table (``dataset.data``) and the subject
            id column name (``dataset.pid_col_name``).
        design: Mapping that may contain "between subjects" and
            "within subjects" entries naming factors.
        combined_data: Supplies the explanatory variables and exactly one
            explained variable.

    Returns:
        The fitted result of ``AnovaRM(...).fit()``.
    """
    table = dataset.data
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()

    assert len(explained) == 1
    outcome = explained[0]

    between_factors = []
    within_factors = []
    for factor in explanatory:
        is_between = ("between subjects" in design
                      and design["between subjects"] == factor.metadata[name])
        if is_between:
            between_factors.append(factor.metadata[name])

        is_within = ("within subjects" in design
                     and design["within subjects"] == factor.metadata[name])
        if is_within:
            within_factors.append(factor.metadata[name])

    # Only the within-subject factors are passed on; the between-subject list
    # is collected but not used (noted as apparently not implemented in
    # statsmodels).
    model = AnovaRM(
        table,
        depvar=outcome.metadata[name],
        subject=dataset.pid_col_name,
        within=within_factors,
        aggregate_func='mean',
    )
    return model.fit()
Esempio n. 14
0
def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    """Bootstrap the median of y within each category of each x.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Unused here.
        combined_data: Supplies the explanatory and explained variables.

    Returns:
        A dict mapping each category to the value returned by
        ``bs.bootstrap`` for that category's y values. A category key shared
        by several variables keeps the last result computed.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()

    results = {}
    for outcome in explained:
        # Only a single explained variable is supported for now.
        assert len(explained) == 1

        for factor in explanatory:
            levels = list(factor.metadata[categories].keys())
            for level in levels:
                sample = dataset.select(
                    outcome.metadata[name],
                    where=[f"{factor.metadata[name]} == '{level}'"])
                results[level] = bs.bootstrap(sample.to_numpy(),
                                              stat_func=bs_stats.median)

    return results
Esempio n. 15
0
def kruskall_wallis(dataset: Dataset, predictions,
                    combined_data: CombinedData):
    """Run ``stats.kruskal`` over one y sample per x category.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Optional sequence; its first element (or the first element
            of that element, if it is a list) is stored on the result.
        combined_data: Supplies the explanatory variables and exactly one
            explained variable.

    Returns:
        A ``TestResult`` with the Kruskal statistic and p-value. Its ``dof``
        is the length of the first sample and its ``x`` is the first
        explanatory variable.

    Raises:
        ValueError: If an explanatory variable's categories entry is None.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert len(explained) == 1
    outcome = explained[0]

    samples = []
    for factor in explanatory:
        levels = factor.metadata[categories]
        if levels is None:
            raise ValueError('')
        samples.extend(
            dataset.select(outcome.metadata[name],
                           where=[f"{factor.metadata[name]} == '{level}'"])
            for level in list(levels.keys()))

    if not predictions:
        prediction = None
    elif isinstance(predictions[0], list):
        prediction = predictions[0][0]
    else:
        prediction = predictions[0]

    statistic, p_val = stats.kruskal(*samples)

    return TestResult(
        name=kruskall_wallis_name,
        test_statistic=statistic,
        p_value=p_val,
        prediction=prediction,
        dof=len(samples[0]),  # TODO This might not be correct
        alpha=combined_data.alpha,
        x=explanatory[0],  # TODO: Not sure if it's possible to have multiple x's?
        y=outcome)
Esempio n. 16
0
def cohens(dataset, predictions, combined_data: CombinedData):
    """Compute Cohen's d between the two groups named by the prediction.

    The groups are the y values for the x categories equal to
    ``pred.lhs.value`` and ``pred.rhs.value``, where ``pred`` is
    ``predictions[0][0]``.

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Non-empty nested sequence; ``predictions[0][0]`` names
            the two categories to compare.
        combined_data: Supplies the explanatory and explained variables; only
            the first of each is used.

    Returns:
        (mean(lhs) - mean(rhs)) divided by the square root of the average of
        the two groups' squared standard deviations.

    Raises:
        ValueError: If no prediction is given, or if either side of the
            prediction matches no category of x.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]

    # The effect size is only defined relative to the two groups named by the
    # prediction, so fail early with a clear message when there is none.
    if not predictions:
        raise ValueError(
            "cohens requires a prediction naming the two groups to compare")
    pred = predictions[0][0]

    lhs = None
    rhs = None
    for c in x.metadata[categories].keys():
        cat_data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
        if c == pred.lhs.value:
            lhs = cat_data
        if c == pred.rhs.value:
            rhs = cat_data

    if lhs is None or rhs is None:
        raise ValueError(
            f"prediction groups {pred.lhs.value!r} and {pred.rhs.value!r} "
            f"must both be categories of {x.metadata[name]}")

    cohens_d = (mean(lhs) - mean(rhs)) / (sqrt((stdev(lhs) ** 2 + stdev(rhs) ** 2) / 2))
    return cohens_d
Esempio n. 17
0
def factorial_ANOVA(dataset: Dataset, predictions, combined_data: CombinedData):
    """Fit an OLS model with main effects and interactions; return its ANOVA table.

    Args:
        dataset: Provides the data table (``dataset.data``) the model is fit on.
        predictions: Unused here; kept so the signature stays unchanged.
        combined_data: Supplies the explanatory variables and exactly one
            explained variable.

    Returns:
        The Type II table produced by ``sm.stats.anova_lm`` for the fitted
        model.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert(len(ys) == 1)

    y = ys[0]

    # Main effects: every explanatory variable as a categorical term.
    main_effects = " + ".join(f"C({x.metadata[name]})" for x in xs)
    formula = f"{y.metadata[name]} ~ {main_effects}"

    # Interactions: starting from each variable, the term grows by one more
    # variable per inner step, so longer products are built cumulatively.
    # NOTE(review): the candidate is appended to `interactions` before
    # _is_interaction_unique sees it -- presumably the helper accounts for
    # that; verify against its definition.
    interactions = []
    for i, x_i in enumerate(xs):
        inter = f"C({x_i.metadata[name]})"
        for j, x_j in enumerate(xs):
            if i == j:
                continue
            inter += " * " + f"C({x_j.metadata[name]})"
            interactions.append(inter)

            if _is_interaction_unique(interactions, inter):
                formula += " + " + inter

    ols_formula = ols(formula, data=dataset.data)
    model = ols_formula.fit()
    # The keyword is `typ`; an unrecognised `type=` is ignored and leaves the
    # default Type I table.
    return sm.stats.anova_lm(model, typ=2)
Esempio n. 18
0
def has_one_y(dataset: Dataset, var_data: CombinedData, alpha):
    """Return True when ``var_data`` has exactly one explained variable.

    ``dataset`` and ``alpha`` are not used.
    """
    explained = var_data.get_explained_variables()
    explained_count = len(explained)
    return explained_count == 1
Esempio n. 19
0
def factorial_ANOVA(dataset: Dataset, predictions,
                    combined_data: CombinedData):
    """Fit an OLS model with main effects and interactions; report one F test.

    Args:
        dataset: Provides the data table (``dataset.data``) the model is fit on.
        predictions: Optional sequence; its first element (or the first element
            of that element, if it is a list) is stored on the result.
        combined_data: Supplies the explanatory variables and exactly one
            explained variable.

    Returns:
        A ``TestResult`` with the F statistic, p-value and degrees of freedom
        read from the Type II ANOVA table row of the last explanatory
        variable, plus the full table. The result's ``x`` is the first
        explanatory variable.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)

    y = ys[0]

    # Main effects: every explanatory variable as a categorical term.
    main_effects = " + ".join(f"C({x.metadata[name]})" for x in xs)
    formula = f"{y.metadata[name]} ~ {main_effects}"

    # Interactions: starting from each variable, the term grows by one more
    # variable per inner step, so longer products are built cumulatively.
    # NOTE(review): the candidate is appended to `interactions` before
    # _is_interaction_unique sees it -- presumably the helper accounts for
    # that; verify against its definition.
    interactions = []
    for i, x_i in enumerate(xs):
        inter = f"C({x_i.metadata[name]})"
        for j, x_j in enumerate(xs):
            if i == j:
                continue
            inter += " * " + f"C({x_j.metadata[name]})"
            interactions.append(inter)

            if _is_interaction_unique(interactions, inter):
                formula += " + " + inter

    ols_formula = ols(formula, data=dataset.data)
    model = ols_formula.fit()
    # The keyword is `typ`; an unrecognised `type=` is ignored and leaves the
    # default Type I table.
    result_df = sm.stats.anova_lm(model, typ=2)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    # The reported row belongs to the last explanatory variable, while the
    # result's `x` is the first.
    # NOTE(review): these differ when there are several x's -- confirm which
    # variable the result is meant to describe.
    col_name = "C(" + xs[-1].metadata[name] + ")"
    # Direct lookup: a missing row fails with a KeyError naming it rather
    # than leaving the result variables unbound.
    row_data = result_df.loc[col_name]
    test_statistic = row_data['F']
    p_val = row_data['PR(>F)']
    dof = row_data['df']

    test_result = TestResult(name=factorial_anova_name,
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=result_df,
                             y=y,
                             x=xs[0])

    return test_result
Esempio n. 20
0
def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    """Bootstrap per-category medians of y and compare their intervals.

    For every category of every explanatory variable the median of y is
    bootstrapped. The (lower, upper) bounds for the categories of the first
    explanatory variable are then compared against the first category's
    interval: overlap is reported as "Greater than or equal to alpha",
    no overlap as "Less than alpha".

    Args:
        dataset: Source of the observations; queried via ``dataset.select``.
        predictions: Optional sequence; its first element (or the first element
            of that element, if it is a list) is stored on the result.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` whose test statistic maps each category to its
        (lower_bound, upper_bound) pair, whose p-value is one of the two
        strings above (or None with fewer than two categories), and whose
        table holds the raw ``bs.bootstrap`` results.
    """
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # Only a single explained variable is supported for now.
        assert (len(ys) == 1)

        # Main effects
        for x in xs:
            for c in x.metadata[categories].keys():
                cat_data = dataset.select(
                    y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
                calculations[c] = bs.bootstrap(cat_data.to_numpy(),
                                               stat_func=bs_stats.median)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    x = xs[0]  # Only the first explanatory variable's categories are compared.
    test_statistic = {
        c: (calculations[c].lower_bound, calculations[c].upper_bound)
        for c in x.metadata[categories].keys()
    }

    alpha = combined_data.alpha
    p_val = None
    reference = None  # interval of the first category
    for bounds in test_statistic.values():
        # Compare against None explicitly: a lower bound of 0 is a valid
        # value and must not be mistaken for "no reference yet".
        if reference is None:
            reference = bounds
            continue

        ref_lb, ref_ub = reference
        # Two closed intervals overlap iff each starts before the other ends;
        # this also covers one interval lying entirely inside the other.
        if bounds[0] <= ref_ub and bounds[1] >= ref_lb:
            p_val = f'Greater than or equal to {alpha}'
        else:
            p_val = f'Less than {alpha}'
        # With more than two categories the last comparison determines p_val.

    dof = None
    test_result = TestResult(name="Bootstrap",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=calculations)

    return test_result