Example #1
0
def vda(dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    pred = None
    if predictions:
        pred = predictions[0][0]

    lhs = None
    rhs = None
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        if c == pred.lhs.value:
            lhs = cat_data
        if c == pred.rhs.value:
            rhs = cat_data
        data.append(cat_data)

    m = len(lhs)
    n = len(rhs)
    concat = lhs.append(rhs)
    r = stats.rankdata(concat)
    r1 = sum(r[range(0, m)])

    # Compute the measure
    # A = (r1/m - (m+1)/2)/n # formula (14) in Vargha and Delaney, 2000
    A = (2 * r1 - m *
         (m + 1)) / (2 * n * m)  # equivalent formula to avoid accuracy errors
    return A
Example #2
0
    def __init__(self, test_to_results, combined_data: CombinedData):
        self.test_to_results = test_to_results
        self.test_to_assumptions = {}
        for test in __ALL_TESTS__:
            if test.name in test_to_results:
                test_assumptions = []
                # TODO: The names get stale if hypothesize() is called multiple times in a row.
                for applied_prop in test._properties:
                    assumption = f"{applied_prop.property.description}: "

                    if applied_prop.property.name == "has_one_x":
                        assumption += combined_data.get_explanatory_variables(
                        )[0].metadata[name]
                    elif applied_prop.property.name == "has_one_y":
                        assumption += combined_data.get_explained_variables(
                        )[0].metadata[name]
                    elif applied_prop.property.name == "has_independent_observations" or applied_prop.property.name == "has_paired_observations":
                        assumption += combined_data.get_explanatory_variables(
                        )[0].metadata[name]
                    else:
                        for stat_var in applied_prop.vars:
                            assumption += f"{stat_var.name}, "
                        assumption = assumption[:-2]

                    if applied_prop.property_test_results is not None:
                        assumption += f": {applied_prop.property_test_results}"
                    test_assumptions.append(assumption)

                self.test_to_assumptions[test.name] = test_assumptions
Example #3
0
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None
    t_stat, p_val = stats.pointbiserialr(data[0], data[1])
    dof = None
    test_result = TestResult(name=pointbiserial_name,
                             test_statistic=t_stat,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result
Example #4
0
def friedman(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    # return stats.friedmanchisquare(*data)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None
    test_statistic, p_val = stats.friedmanchisquare(*data)
    dof = len(data[0])  # TODO This might not be correct
    test_result = TestResult(name="Kruskall Wallis Test",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result
Example #5
0
def paired_students_t(dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    t_stat, p_val = stats.ttest_rel(data[0], data[1])
    dof = (len(data[0]) + len(data[1])) / 2. - 1  # (Group1 + Group2)/2 - 1
    test_result = TestResult(name=paired_students_name,
                             test_statistic=t_stat,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             x=x,
                             y=y)

    return test_result
Example #6
0
def add_paired_property(
        dataset,
        combined_data: CombinedData,
        study_type: str,
        design: Dict[str, str] = None):  # check same sizes are identical
    global paired

    x = None
    y = None
    combined_data.properties[paired] = False
    if isinstance(combined_data, BivariateData):
        if study_type == experiment_identifier:
            # Just need one variable to be Categorical and another to be Continuous (regardless of role)
            x = combined_data.get_vars(iv_identifier)
            y = combined_data.get_vars(dv_identifier)

        else:  # study_type == observational_identifier
            x = combined_data.get_vars(contributor_identifier)
            y = combined_data.get_vars(outcome_identifier)

        if x and y:
            assert (len(x) == len(y) == 1)
            x = x[0]
            y = y[0]

            if x.is_categorical() and y.is_continuous():
                if within_subj in design and design[within_subj] == x.metadata[
                        name]:
                    combined_data.properties[paired] = True
Example #7
0
def cohens(dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    pred = None
    if predictions:
        pred = predictions[0][0]

    lhs = None
    rhs = None
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        if c == pred.lhs.value:
            lhs = cat_data
        if c == pred.rhs.value:
            rhs = cat_data
        data.append(cat_data)

    cohens_d = (mean(lhs) - mean(rhs)) / (sqrt(
        (stdev(lhs)**2 + stdev(rhs)**2) / 2))
    return cohens_d
Example #8
0
def wilcoxon_signed_rank(dataset: Dataset, predictions,
                         combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None
    t_stat, p_val = stats.wilcoxon(data[0], data[1])
    dof = len(data[0])  # TODO This might not be correct
    test_result = TestResult(name=wilcoxon_signed_rank_name,
                             test_statistic=t_stat,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             x=x,
                             y=y)

    return test_result
def kruskall_wallis(dataset: Dataset, predictions, combined_data: CombinedData): 
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs: 
        if x.metadata[categories] is None: 
            raise ValueError('')
        cat = [k for k,v in x.metadata[categories].items()]
        for c in cat: 
            cat_data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)
    
    if predictions: 
        if isinstance(predictions[0], list): 
            prediction = predictions[0][0]
        else: 
            prediction = predictions[0]
    else: 
        prediction = None
    t_stat, p_val = stats.kruskal(*data)
    dof = len(data[0]) # TODO This might not be correct
    test_result = TestResult( 
                        name = kruskall_wallis_name,
                        test_statistic = t_stat,
                        p_value = p_val,
                        prediction = prediction,
                        dof = dof,
                        alpha = combined_data.alpha,
                        x = xs[0], # TODO: Not sure if it's possible to have multiple x's?
                        y = y)
    
    return test_result
def chi_square(dataset: Dataset, predictions, combined_data: CombinedData): 
    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    if len(xs) == 1: 
        if len(ys) == 1: 
            x = xs[0]
            y = ys[0]

            # Get the count for each category
            x_cat = [k for k,v in x.metadata[categories].items()]
            y_cat = [k for k,v in y.metadata[categories].items()]

            contingency_table = []
            contingency_table_key = [] # labels for the order in which data is stored in data array (define above)

            for xc in x_cat: 
                table_row = []
                table_row_key = []
                for yc in y_cat: 
                    data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{xc}'", f"{y.metadata[name]} == '{yc}'"])
                    table_row.append(len(data))

                    x_y_key = str(x.metadata[name]) + ':' + str(xc) + ' by ' + str(y.metadata[name]) + ':' + str(yc)
                    table_row_key.append(x_y_key)
                
                assert(len(table_row_key) == len(table_row))
                assert(len(table_row) == len(y_cat))
                contingency_table.append(table_row)
                contingency_table_key.append(table_row_key)
            
        else: 
            raise ValueError(f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}")    
    else: 
        raise ValueError(f"Currently, chi square requires/only supports 1 explanatory variable, instead received: {len(xs)} -- {xs}")

    # chi2, p, dof, ex = chi2_contingency(obs, correction=False)
    # chi2, p, dof, ex = stats.chi2_contingency(contingency_table, correction=False)

    if predictions: 
        if isinstance(predictions[0], list): 
            prediction = predictions[0][0]
        else: 
            prediction = predictions[0]
    else: 
        prediction = None
    test_statistic, p_val, dof, ex = stats.chi2_contingency(contingency_table, correction=False)
    dof = None
    test_result = TestResult( 
                        name = chi_square_name,
                        test_statistic = test_statistic,
                        p_value = p_val,
                        prediction = prediction,
                        dof = dof,
                        alpha = combined_data.alpha,
                        x = x,
                        y = y)
    
    return test_result
Example #11
0
def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha):
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) == 1:
            x = xs[0]
            y = ys[0]

            if x.is_categorical() and y.is_categorical():

                # Get the count for each category
                x_cat = [k for k, v in x.metadata[categories].items()]
                y_cat = [k for k, v in y.metadata[categories].items()]

                for xc in x_cat:
                    for yc in y_cat:
                        data = dataset.select(
                            y.metadata[name],
                            where=[
                                f"{x.metadata[name]} == '{xc}'",
                                f"{y.metadata[name]} == '{yc}'"
                            ])

                        # Check that the count is at least five for each of the (x,y) group pairs
                        if len(data) < 5:
                            return False

                return True
            else:
                return False
        else:
            raise ValueError(
                f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
            )
    else:
        x0 = xs[0]
        x1 = xs[1]

        if x0.is_categorical() and x1.is_categorical():
            # Get the count for each category
            x0_cat = [k for k, v in x0.metadata[categories].items()]
            x1_cat = [k for k, v in x1.metadata[categories].items()]

            for x0c in x0_cat:
                for x1c in x1_cat:
                    data = dataset.select(x1.metadata[name],
                                          where=[
                                              f"{x.metadata[name]} == '{xc}'",
                                              f"{x1.metadata[name]} == '{x1c}'"
                                          ])

                    # Check that the count is at least five for each of the (x,x1) group pairs
                    if len(data) < 5:
                        return False
            return True
        else:
            return False
def fishers_exact(dataset: Dataset, predictions, combined_data: CombinedData): 
    assert(len(combined_data.vars) == 2)

    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert(len(xs) == 1)
    assert(len(ys) == 1)

    x = xs[0]
    y = ys[0]

    # Get the count for each category
    x_cat = [k for k,v in x.metadata[categories].items()]
    y_cat = [k for k,v in y.metadata[categories].items()]

    contingency_table = []
    contingency_table_key = [] # labels for the order in which data is stored in data array (define above)

    for xc in x_cat: 
        table_row = []
        table_row_key = []
        for yc in y_cat: 
            data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{xc}'", f"{y.metadata[name]} == '{yc}'"])
            table_row.append(len(data))

            x_y_key = str(x.metadata[name]) + ':' + str(xc) + ' by ' + str(y.metadata[name]) + ':' + str(yc)
            table_row_key.append(x_y_key)
        
        assert(len(table_row_key) == len(table_row))
        assert(len(table_row) == len(y_cat))
        contingency_table.append(table_row)
        contingency_table_key.append(table_row_key)

    # odds_ratio, p_value = stats.fisher_exact(contingency_table, alternative='two-sided')
    # return FishersResult(odds_ratio, p_value)

    if predictions: 
        if isinstance(predictions[0], list): 
            prediction = predictions[0][0]
        else: 
            prediction = predictions[0]
    else: 
        prediction = None
    odds_ratio, p_val = stats.fisher_exact(contingency_table, alternative='two-sided')
    dof = None
    test_result = TestResult( 
                        name = fisher_exact_name,
                        test_statistic = odds_ratio,
                        p_value = p_val,
                        prediction = prediction,
                        dof = dof,
                        alpha = combined_data.alpha,
                        x = x,
                        y = y)
    
    return test_result
Example #13
0
def rm_one_way_anova(dataset: Dataset, predictions, design,
                     combined_data: CombinedData):
    data = dataset.data
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(ys) == 1)
    y = ys[0]
    between_subjs = []
    within_subjs = []
    for x in xs:
        if "between subjects" in design and design[
                "between subjects"] == x.metadata[name]:
            between_subjs.append(x.metadata[name])
        if "within subjects" in design and design[
                "within subjects"] == x.metadata[name]:
            within_subjs.append(x.metadata[name])

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    key = dataset.pid_col_name
    aovrm2way = AnovaRM(data,
                        depvar=y.metadata[name],
                        subject=key,
                        within=within_subjs,
                        aggregate_func='mean')
    # aovrm2way = AnovaRM(data, depvar=y.metadata[name], subject=dataset.pid_col_name, within=within_subjs, between=between_subjs) # apparently not implemented in statsmodels
    res2way = aovrm2way.fit()
    result_df = res2way.anova_table

    col_name = x.metadata[name]
    for row_name in result_df.index:
        if row_name == col_name:
            row_data = result_df.loc[row_name]
            test_statistic = row_data['F Value']
            p_val = row_data['Pr > F']
            dof = (row_data['Num DF'], row_data['Den DF'])

    test_result = TestResult(name=rm_one_way_anova_name,
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=result_df,
                             x=x,
                             y=y)

    return test_result
Example #14
0
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]
    cat = [k for k, v in x.metadata[categories].items()]
    data = []

    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None
    if len(data[0]) == len(
            data[1]
    ):  # Scipy requires that groups have equal sizes even though this is not technically a requirement of the Pointbiserial correlation
        corr, p_val = stats.pointbiserialr(data[0], data[1])
    else:
        # Compute pointbiserial correlation on our own
        data_all = data[0].append(data[1])

        group_0_mean = np.mean(data[0])
        group_0_size = len(data[0])
        group_1_mean = np.mean(data[1])
        group_1_size = len(data[1])

        sample_size = group_0_size + group_1_size
        assert (sample_size == len(data_all))
        sample_std = stats.tstd(data_all)

        corr = (group_0_mean - group_1_mean) / sample_std * math.sqrt(
            (group_0_size * group_1_size) / (sample_size * (sample_size - 1)))
        t_stat, p_val = stats.ttest_ind(data[0], data[1], equal_var=True)

    dof = None
    test_result = TestResult(name=POINTBISERIAL_NAME,
                             test_statistic=corr,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result
Example #15
0
def f_test(dataset: Dataset, predictions, combined_data: CombinedData):  
    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert(len(xs) == 1)
    assert(len(ys) == 1)

    x = xs[0]
    y = ys[0]
    
    formula = ols(f"{y.metadata[name]} ~ C({x.metadata[name]})", data=dataset.data)
    model =formula.fit()
    return sm.stats.anova_lm(model, type=2)
Example #16
0
def wilcoxon_signed_rank(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]
    cat = [k for k,v in x.metadata[categories].items()]
    data = []

    for c in cat: 
        cat_data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)
    
    return stats.wilcoxon(data[0], data[1])
Example #17
0
def friedman(dataset: Dataset, predictions, combined_data: CombinedData): 
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs: 
        cat = [k for k,v in x.metadata[categories].items()]
        for c in cat: 
            cat_data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)
    
    return stats.friedmanchisquare(*data)
Example #18
0
def add_eq_variance_property(dataset, combined_data: CombinedData,
                             study_type: str):
    xs = None
    ys = None
    cat_xs = []
    cont_ys = []
    grouped_data = []

    if study_type == experiment_identifier:
        # Just need one variable to be Catogrical and another to be Continuous (regardless of role) -- both could be variable_identifier types
        xs = combined_data.get_vars(iv_identifier)
        ys = combined_data.get_vars(dv_identifier)

    else:  # study_type == observational_identifier
        xs = combined_data.get_vars(contributor_identifier)
        ys = combined_data.get_vars(outcome_identifier)

    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    combined_data.properties[eq_variance] = None

    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(
                        y.metadata[name],
                        where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)
                if isinstance(combined_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                    combined_data.properties[eq_variance] = eq_var
                elif isinstance(combined_data, MultivariateData):
                    combined_data.properties[
                        eq_variance + '::' + x.metadata[name] + ':' +
                        y.metadata[name]] = compute_eq_variance(grouped_data)
                else:
                    raise ValueError(
                        f"combined_data_data object is neither BivariateData nor MultivariateData: {type(combined_data)}"
                    )
Example #19
0
def kruskall_wallis(dataset: Dataset, predictions, combined_data: CombinedData): 
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs: 
        if x.metadata[categories] is None: 
            raise ValueError('')
        cat = [k for k,v in x.metadata[categories].items()]
        for c in cat: 
            cat_data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)
    
    return stats.kruskal(*data)
Example #20
0
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData): 
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert(len(xs) == 1)
    assert(len(ys) == 1)
    x = xs[0]
    y = ys[0]
    cat = [k for k,v in x.metadata[categories].items()]
    data = []

    for c in cat: 
        cat_data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)
    
    return stats.pointbiserialr(data[0], data[1])
Example #21
0
def add_categories_normal(dataset,
                          combined_data: CombinedData,
                          study_type: str,
                          design: Dict[str, str] = None):
    global cat_distribution

    xs = None
    ys = None
    cat_xs = []
    cont_ys = []
    grouped_data = dict()

    if study_type == experiment_identifier:
        # Just need one variable to be Catogrical and another to be Continuous (regardless of role) -- both could be variable_identifier types
        xs = combined_data.get_vars(iv_identifier)
        ys = combined_data.get_vars(dv_identifier)

    else:  # study_type == observational_identifier
        xs = combined_data.get_vars(contributor_identifier)
        ys = combined_data.get_vars(outcome_identifier)

    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    combined_data.properties[cat_distribution] = None

    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(
                        y.metadata[name],
                        where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data_name = str(x.metadata[name] + ':' + c)
                    grouped_data[grouped_data_name] = compute_distribution(
                        data)
                combined_data.properties[cat_distribution] = dict()
                combined_data.properties[cat_distribution][
                    y.metadata[name] + '::' + x.metadata[name]] = grouped_data
def f_test(dataset: Dataset, predictions, combined_data: CombinedData):  
    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert(len(xs) == 1)
    assert(len(ys) == 1)

    x = xs[0]
    y = ys[0]
    
    formula = ols(f"{y.metadata[name]} ~ C({x.metadata[name]})", data=dataset.data)
    model =formula.fit()
    

    if predictions:
        if isinstance(predictions[0], list): 
            prediction = predictions[0][0]
        else: 
            prediction = predictions[0]
    else: 
        prediction = None
    result_df = sm.stats.anova_lm(model, type=2)
    # Need to inspect the result_df and return the appropriate test_statistic/p_value pair based on the prediction
    col_name = "C(" + x.metadata[name] + ")"
    for row_name in result_df.index: 
        if row_name == col_name: 
            row_data = result_df.loc[row_name]
            test_statistic = row_data['F']
            p_val = row_data['PR(>F)']
            dof = row_data['df']

    test_result = TestResult( 
                        name = f_test_name,
                        test_statistic = test_statistic,
                        p_value = p_val,
                        prediction = prediction,
                        dof = dof,
                        alpha = combined_data.alpha,
                        table = result_df,
                        x=x,
                        y=y)
    
    return test_result
Example #23
0
def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()
    cat_xs = []
    cont_ys = []
    grouped_data = []

    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    eq_var = (None, None)
    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(
                        y.metadata[name],
                        where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)
                if isinstance(var_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                # elif isinstance(var_data, MultivariateData):
                #     var_data.properties[eq_variance + '::' + x.metadata[name] + ':' + y.metadata[name]] = compute_eq_variance(grouped_data)
                else:
                    raise ValueError(
                        f"var_data_data object is neither BivariateData nor MultivariateData: {type(var_data)}"
                    )

    if eq_var[0] is None and eq_var[1] is None:
        import pdb
        pdb.set_trace()
        # raise Exception("did not compute variance, this is a bug")
        return False

    return (eq_var[1] > alpha)
Example #24
0
def rm_one_way_anova(dataset: Dataset, design, combined_data: CombinedData): 
    data = dataset.data
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert(len(ys) == 1)
    y = ys[0]
    between_subjs = []
    within_subjs = []
    for x in xs: 
        if "between subjects" in design and design["between subjects"] == x.metadata[name]:
            between_subjs.append(x.metadata[name])
        if "within subjects" in design and design["within subjects"] == x.metadata[name]:
            within_subjs.append(x.metadata[name])
    
    # import pdb; pdb.set_trace()
    key = dataset.pid_col_name
    # import pdb; pdb.set_trace()
    aovrm2way = AnovaRM(data, depvar=y.metadata[name], subject=key, within=within_subjs, aggregate_func='mean')
    # aovrm2way = AnovaRM(data, depvar=y.metadata[name], subject=dataset.pid_col_name, within=within_subjs, between=between_subjs) # apparently not implemented in statsmodels
    res2way = aovrm2way.fit()
    return res2way
Example #25
0
def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys: 
        # for now
        assert(len(ys) == 1)
        
        # Main effects
        for x in xs: 
            cat = [k for k,v in x.metadata[categories].items()]
            for c in cat: 
                cat_data = dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
                stat = bs.bootstrap(cat_data.to_numpy(), stat_func=bs_stats.median)
                calculations[c] = stat
                # import pdb; pdb.set_trace()
                # store all the medians & confidence intervals
                # return all the medians & CIs
                # data.append(cat_data)
    
    return calculations
Example #26
0
def factorial_ANOVA(dataset: Dataset, predictions, combined_data: CombinedData): 

    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert(len(ys) == 1)

    y = ys[0]
    
    formula = f"{y.metadata[name]} ~ "

    for i in range(len(xs)): 
        x = xs[i]
        formula += f"C({x.metadata[name]})"

        if i < len(xs) - 1: 
            formula += " + "
    
    
    # Add the interactions
    interactions = []
    for i in range(len(xs)): 
        x_i = xs[i]
        inter = f"C({x_i.metadata[name]})" 
        for j in range(len(xs)):
            if i != j: 
                x_j = xs[j]
                inter += " * " + f"C({x_j.metadata[name]})"
                interactions.append(inter)
                
                if _is_interaction_unique(interactions, inter):
                    formula += " + " +  inter

    ols_formula = ols(formula, data=dataset.data)
    model = ols_formula.fit()
    return sm.stats.anova_lm(model, type=2)
Example #27
0
def has_one_x(dataset: Dataset, var_data: CombinedData, alpha):
    xs = var_data.get_explanatory_variables()

    return len(xs) == 1
Example #28
0
def factorial_ANOVA(dataset: Dataset, predictions,
                    combined_data: CombinedData):

    # Construct formula
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)

    y = ys[0]

    formula = f"{y.metadata[name]} ~ "

    for i in range(len(xs)):
        x = xs[i]
        formula += f"C({x.metadata[name]})"

        if i < len(xs) - 1:
            formula += " + "

    # Add the interactions
    interactions = []
    for i in range(len(xs)):
        x_i = xs[i]
        inter = f"C({x_i.metadata[name]})"
        for j in range(len(xs)):
            if i != j:
                x_j = xs[j]
                inter += " * " + f"C({x_j.metadata[name]})"
                interactions.append(inter)

                if _is_interaction_unique(interactions, inter):
                    formula += " + " + inter

    ols_formula = ols(formula, data=dataset.data)
    model = ols_formula.fit()
    result_df = sm.stats.anova_lm(model, type=2)
    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    col_name = "C(" + x.metadata[name] + ")"
    for row_name in result_df.index:
        if row_name == col_name:
            row_data = result_df.loc[row_name]
            test_statistic = row_data['F']
            p_val = row_data['PR(>F)']
            dof = row_data['df']

    test_result = TestResult(name=factorial_anova_name,
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=result_df,
                             y=y,
                             x=xs[0])

    return test_result
Example #29
0
def has_one_y(dataset: Dataset, var_data: CombinedData, alpha):
    ys = var_data.get_explained_variables()

    return len(ys) == 1
Example #30
0
def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # for now
        assert (len(ys) == 1)

        # Main effects
        for x in xs:
            cat = [k for k, v in x.metadata[categories].items()]
            for c in cat:
                cat_data = dataset.select(
                    y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
                stat = bs.bootstrap(cat_data.to_numpy(),
                                    stat_func=bs_stats.median)
                calculations[c] = stat

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    x = xs[0]  # We should do this for the prediction, only....?
    cat = [k for k, v in x.metadata[categories].items()]
    test_statistic = {}
    p_val = None
    for c in cat:
        # import pdb; pdb.set_trace()
        lb = calculations[c].lower_bound
        ub = calculations[c].upper_bound

        test_statistic[c] = (lb, ub)

    alpha = combined_data.alpha
    lb = None
    ub = None
    for group, bounds in test_statistic.items():
        if not lb:
            assert (not ub)
            lb = bounds[0]
            ub = bounds[1]
        else:
            if bounds[0] >= lb and bounds[0] <= ub:
                p_val = f'Greater than or equal to {alpha}'
            elif bounds[1] >= lb and bounds[1] <= ub:
                p_val = f'Greater than or equal to {alpha}'
            else:
                p_val = f'Less than {alpha}'

    dof = None
    test_result = TestResult(name="Bootstrap",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=calculations)

    return test_result