Example #1
def pearson_test(x, y):
    """Found in scipy.stats as pearsonr
    Used to evaluate the pearson correlation between X and Y.

    Parameters
    ----------
    x: list or numpy array, 1-D
        Our "X" variable for determining the strength of our pearson correlation with
    y: list or numpy array, 1-D
        Our "Y" variable for determining the strength of our pearson correlation with

    Returns
    -------
    rho: float, -1 <= rho <= 1
        Our measure of pearson correlation between x and y
    p: float, 0 <= p <= 1
        How significant our observed pearson correlation is
    """
    x, y = _check_table(x, only_count=False), _check_table(y, only_count=False)
    if len(x) != len(y):
        raise ValueError(
            "Cannot calculate correlation with datasets of different lengths")
    n = len(x)
    rho = (n * np.sum(x * y) - np.sum(x) * np.sum(y)) / (
        sqrt(n * np.sum(np.power(x, 2)) - pow(np.sum(x), 2)) *
        sqrt(n * np.sum(np.power(y, 2)) - pow(np.sum(y), 2)))
    t_stat = rho * sqrt((n - 2) / (1 - pow(rho, 2)))
    p = 2 * (1 - t.cdf(abs(t_stat), n - 2))
    return rho, p
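A minimal usage sketch for pearson_test, checked against the scipy.stats.pearsonr counterpart named in the docstring. It assumes the function above (and its private _check_table helper) is importable from this module, with numpy and scipy installed; the data is made up.

import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
x = rng.normal(size=50)
y = 0.6 * x + rng.normal(scale=0.5, size=50)

print(pearson_test(x, y))   # implementation above
print(pearsonr(x, y))       # scipy reference; rho and p should agree closely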
Example #2
def rank_biserial_correlation_test(x, y):
    """Not found in scipy.stats or statsmodels

    x: list or numpy array, 1-D
        Our observations. These are expected to be ordinal
    y: list or numpy array, 1-D
        Our groupings variable, or masked array. Must only have two variables and be the same length as x

    Returns
    -------
    rho: float
        The measure of correlation between our two groups
    p: float
        The likelihood of observing this level of correlation if both groups were drawn from the same distribution
    """
    x, y = _check_table(x, only_count=True), _check_table(y, only_count=True)
    if len(x) != len(y):
        raise ValueError("X and Y must be of the same length")
    if len(np.unique(y)) != 2:
        raise AttributeError(
            "Need to have two groupings for biserial correlation")
    group_0, group_1 = x[y == np.unique(y)[0]], x[y == np.unique(y)[1]]
    mu_1, mu_0 = np.mean(group_1), np.mean(group_0)
    n, n_1, n_0 = len(x), len(group_1), len(group_0)
    s = sqrt(n_1 * n_0 * (n + 1) / 12)
    rho = 2 * ((mu_1 - mu_0) / (n_1 + n_0))
    u_min = min((1 + rho) * n_1 * n_0 / 2, (1 - rho) * n_1 * n_0 / 2)
    mu = n_1 * n_0 / 2
    z = (u_min - mu) / s
    p = 2 * (1 - norm.cdf(abs(z)))
    return rho, p
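A small usage sketch for rank_biserial_correlation_test on made-up ordinal ratings split into two groups; it assumes the function above and its _check_table helper are in scope with numpy installed. There is no direct scipy counterpart to compare against.

import numpy as np

ratings = np.array([1, 2, 2, 3, 3, 3, 4, 4, 5, 5,
                    2, 3, 3, 4, 4, 4, 5, 5, 5, 5])   # ordinal scores
groups = np.array([0] * 10 + [1] * 10)               # binary grouping variable

print(rank_biserial_correlation_test(ratings, groups))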
Example #3
def spearman_test(x, y):
    """Found in scipy.stats as spearmanr
    Used to evaluate the correlation between the ranks of "X" and "Y", that is, if there exists a
    monotonic relationship between X and Y.

    Parameters
    ----------
    x: list or numpy array, 1-D
        Our "X" variable for determining the strength of monotonic correlation with
    y: list or numpy array, 1-D
        Our "Y" variable for determining the strength of monotonic correlation with

    Returns
    -------
    rho: float, -1 <= rho <= 1
        Our measure of monotonic correlation between x and y
    p: float, 0 <= p <= 1
        How significant our observed monotonic correlation is
    """
    x, y = _check_table(x, only_count=False), _check_table(y, only_count=False)
    if len(x) != len(y):
        raise ValueError(
            "Cannot calculate correlation with datasets of different lengths")
    df = len(x) - 2
    rank_x, rank_y = rankdata(x), rankdata(y)
    std_x, std_y = np.std(rank_x, ddof=1), np.std(rank_y, ddof=1)
    cov = np.cov(rank_x, rank_y)[0][1]
    rho = cov / (std_x * std_y)
    t_stat = rho * sqrt(df / (1 - pow(rho, 2)))
    p = 2 * (1 - t.cdf(abs(t_stat), df))
    return rho, p
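A minimal usage sketch for spearman_test against scipy.stats.spearmanr, under the same assumptions as above (module helpers such as _check_table in scope, numpy and scipy installed, made-up data).

import numpy as np
from scipy.stats import spearmanr

rng = np.random.default_rng(1)
x = rng.normal(size=60)
y = np.exp(x) + rng.normal(scale=0.1, size=60)   # monotonic but non-linear relationship

print(spearman_test(x, y))   # implementation above
print(spearmanr(x, y))       # scipy reference; should agree closely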
def chi_goodness_of_fit_test(observed, expected=None):
    """Found in scipy.stats as chisquare
    Used when we cannot divide the data cleanly into a contingency table or when we have actual expected results to
    compare to.

    Parameters
    ----------
    observed: list or numpy array, 1-D
        Our observed data points
    expected: list or numpy array, 1-D, default is None
        What we expected the results to be. If None given, then we expect all data points to be equally likely

    Returns
    -------
    X: float
        The Chi statistic, or the sum of squared differences between observed and expected
    p: float, 0 <= p <= 1
        The likelihood that our observed differences, given the amount of data, can be attributed to chance
    """
    observed = _check_table(observed, False)
    if expected is None:
        expected = np.repeat(np.mean(observed), len(observed))
    else:
        expected = _check_table(expected)
    df = len(observed) - 1
    X = np.sum(np.power(observed - expected, 2) / expected)
    p = 1 - chi2.cdf(X, df)
    return X, p
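A usage sketch for chi_goodness_of_fit_test with expected=None, in which case every category is treated as equally likely; scipy.stats.chisquare uses the same default, so the two should agree. Assumes the function and _check_table are importable and numpy and scipy are installed.

import numpy as np
from scipy.stats import chisquare

observed = np.array([18, 22, 25, 15, 20])

print(chi_goodness_of_fit_test(observed))   # implementation above
print(chisquare(observed))                  # scipy reference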
Example #5
def point_biserial_correlation_test(x, y):
    """Found in scipy.stats as pointbiserialr

    x: list or numpy array, 1-D
        Our observations. These are expected to be continuous.
    y: list or numpy array, 1-D
        Our groupings variable, or masked array. Must only have two variables and be the same length as x

    Returns
    -------
    rho: float
        The measure of correlation between our two groups
    p: float
        The likelihood of observing this level of correlation if there were no association between x and the groupings
    """
    x = _check_table(x, only_count=False)
    y = _check_table(y, only_count=True)
    if len(x) != len(y):
        raise ValueError("X and Y must be of the same length")
    if len(np.unique(y)) != 2:
        raise AttributeError(
            "Need to have two groupings for biserial correlation")
    group_0, group_1 = x[y == np.unique(y)[0]], x[y == np.unique(y)[1]]
    mu_1, mu_0 = np.mean(group_1), np.mean(group_0)
    n, n_1, n_0 = len(x), len(group_1), len(group_0)
    s = np.std(x, ddof=1)
    rho = ((mu_1 - mu_0) / s) * sqrt(n_1 * n_0 / (n * (n - 1)))
    t_val = rho * sqrt((n - 2) / (1 - pow(rho, 2)))
    p = 2 * (1 - t.cdf(abs(t_val), n - 2))
    return rho, p
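A minimal usage sketch for point_biserial_correlation_test, compared against scipy.stats.pointbiserialr (the binary variable is passed first to scipy). Assumes the function above, its _check_table helper, numpy, and scipy are available; the data is synthetic.

import numpy as np
from scipy.stats import pointbiserialr

rng = np.random.default_rng(2)
groups = rng.integers(0, 2, size=80)          # binary grouping variable
values = rng.normal(size=80) + 0.8 * groups   # continuous outcome, shifted by group

print(point_biserial_correlation_test(values, groups))   # implementation above
print(pointbiserialr(groups, values))                    # scipy reference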
def lepage_test(data_1, data_2):
    """Not found in either scipy.stats or statsmodels
    Used to compare the central tendency and variability in two samples. The statistic is the sum of the squared,
    standardized Wilcoxon rank-sum and Ansari-Bradley statistics.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        A list or array containing all observations from our first dataset
    data_2: list or numpy array, 1-D
        A list or array containing all observations from our second dataset

    Returns
    -------
    d: float
        Our measure of central tendency and variability among the two datasets
    p: float, 0 <= p <= 1
        The likelihood we would find this level of central tendency and variability among two datasets sampled from the
        same population
    """
    data_1, data_2 = _check_table(data_1, only_count=False), _check_table(data_2, only_count=False)
    n, m = len(data_1), len(data_2)
    N = n + m
    c, _ = ansari_bradley_test(data_1, data_2, alternative='two-sided')
    w, _ = two_sample_wilcoxon_test(data_1, data_2, alternative='two-sided')
    expected_w = n * (N + 1) / 2
    sd_w = sqrt(m * n * (N + 1) / 12)
    expected_c = n * pow(N + 1, 2) / (4 * N)
    sd_c = sqrt(m * n * (N + 1) * (3 + pow(N, 2)) / (48 * pow(N, 2)))
    d = pow((w - expected_w) / sd_w, 2) + pow((c - expected_c) / sd_c, 2)
    p = 1 - chi2.cdf(d, 2)
    return d, p
def g_goodness_of_fit_test(observed, expected=None):
    """Found in scipy.stats as power_divergence(lambda_="log-likelihood")
    Similar to chi_goodness_of_fit_test, used when we cannot divide the data cleanly into a contingency table or when we
    have actual expected results to compare to.

    Parameters
    ----------
    observed: list or numpy array, 1-D
        Our observed data
    expected: list or numpy array, default is None
        What we expected the results to be. If None given, then we expect all data points to be equally likely

    Returns
    -------
    g: float
        The G statistic, or the likelihood ratio of the difference between observed and expected
    p: float, 0 <= p <= 1
        The likelihood that our observed differences are due to chance
    """
    observed = _check_table(observed, False)
    if expected is None:
        expected = np.repeat(np.mean(observed), len(observed))
    else:
        expected = _check_table(expected)
    df = len(observed) - 1
    g = 2 * np.sum(observed * np.log(observed / expected))
    p = 1 - chi2.cdf(g, df)
    return g, p
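A usage sketch for g_goodness_of_fit_test, compared with scipy.stats.power_divergence(lambda_="log-likelihood") as named in the docstring; with expected=None both use the per-category mean as the expected count. Assumes the function, _check_table, numpy, and scipy are available.

import numpy as np
from scipy.stats import power_divergence

observed = np.array([30, 14, 34, 45, 57])

print(g_goodness_of_fit_test(observed))                       # implementation above
print(power_divergence(observed, lambda_="log-likelihood"))   # scipy reference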
Example #8
def two_sample_t_test(data_1, data_2, alternative='two-sided', paired=False):
    """This test can be found in scipy.stats as either ttest_rel or ttest_ind
    Used when we want to compare the distributions of two samples, and while we assume that they both follow a normal
    distribution, their sample size is too small to reliably use a z-test.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed dataset we are comparing to data_2
    data_2: list or numpy array, 1-D
        The observed dataset we are comparing to data_1
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis
    paired: bool, default is False
        Whether or not data_1 and data_2 are paired observations

    Return
    ------
    t_value: number
        The t statistic for the difference between our datasets
    p: float, 0 <= p <= 1
        The likelihood that the observed differences are due to chance
    """
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    data_1, data_2 = _check_table(data_1, False), _check_table(data_2, False)
    data_1_mean, data_2_mean = np.mean(data_1), np.mean(data_2)
    if paired:
        """This test can be found in scipy.stats as ttest_rel"""
        if len(data_1) != len(data_2):
            raise AttributeError("The data types are not paired")
        n = len(data_1)
        df = n - 1
        squared_difference = np.sum(np.power(data_1 - data_2, 2))
        difference = np.sum(data_1 - data_2)
        std = sqrt((squared_difference - (np.power(difference, 2) / n)) / df)
        standard_error_difference = _standard_error(std, n)

    else:
        # We perform the Welch T-Test due to assumption that variances are not equal
        """This test can be found in scipy.stats as ttest_ind"""
        data_1_var, data_2_var = np.var(data_1, ddof=1), np.var(data_2, ddof=1)
        data_1_n, data_2_n = len(data_1), len(data_2)
        df = np.power((data_1_var / data_1_n) + (data_2_var / data_2_n), 2) /\
             ((np.power(data_1_var, 2) / (np.power(data_1_n, 2) * (data_1_n - 1))) +
              (np.power(data_2_var, 2) / (np.power(data_2_n, 2) * (data_2_n - 1))))
        standard_error_difference = sqrt((data_1_var / data_1_n) +
                                         (data_2_var / data_2_n))
    t_value = (data_1_mean - data_2_mean) / standard_error_difference
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - t.cdf(abs(t_value), df))
    elif alternative.casefold() == 'greater':
        p = 1 - t.cdf(t_value, df)
    else:
        p = t.cdf(t_value, df)
    return t_value, p
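A usage sketch exercising both branches of two_sample_t_test on synthetic data, with scipy.stats.ttest_ind(equal_var=False) and ttest_rel as references; assumes the function, its helpers (_check_table, _standard_error), numpy, and scipy are in scope.

import numpy as np
from scipy.stats import ttest_ind, ttest_rel

rng = np.random.default_rng(3)
a = rng.normal(loc=0.0, size=25)
b = rng.normal(loc=0.5, size=30)

print(two_sample_t_test(a, b))            # Welch branch above
print(ttest_ind(a, b, equal_var=False))   # scipy reference

paired_b = a + rng.normal(scale=0.3, size=25)
print(two_sample_t_test(a, paired_b, paired=True))   # paired branch above
print(ttest_rel(a, paired_b))                        # scipy reference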
Example #9
def yeun_welch_test(data_1, data_2, p=10, alternative='two-sided'):
    """Not found in scipy.stats or statsmodels.
    Used when we wish to perform a two-sample t-test, but cannot assume normality or equality of variances.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed dataset we are comparing to data_2
    data_2: list or numpy array, 1-D
        The observed dataset we are comparing to data_1
    p: float, 0 <= p <= 100
        The percentage of data we wish to drop from each sample
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Return
    ------
    t_value: number
        The t statistic for the difference between our datasets
    p: float, 0 <= p <= 1
        The likelihood that the observed differences are due to chance
    """
    if p < 0 or p > 100:
        raise ValueError("Percentage trimmed needs to be between 0 and 100")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    sort_data_1, sort_data_2 = np.sort(data_1), np.sort(data_2)
    n_1, n_2 = len(data_1) * p // 200, len(data_2) * p // 200
    trim_data_1, trim_data_2 = sort_data_1[
        n_1:len(sort_data_1) - n_1], sort_data_2[n_2:len(sort_data_2) - n_2]
    n_x, n_y = len(data_1), len(data_2)
    m_x, m_y = len(trim_data_1), len(trim_data_2)
    # Winsorize: replace each trimmed observation with the nearest retained value
    winsor_values_1 = np.append(np.repeat(trim_data_1[0], n_1),
                                np.repeat(trim_data_1[-1], n_1))
    winsor_values_2 = np.append(np.repeat(trim_data_2[0], n_2),
                                np.repeat(trim_data_2[-1], n_2))
    winsor_data_1 = np.append(trim_data_1, winsor_values_1)
    winsor_data_2 = np.append(trim_data_2, winsor_values_2)
    s_x, s_y = np.var(winsor_data_1, ddof=1), np.var(winsor_data_2, ddof=1)
    x_bar, y_bar = np.mean(trim_data_1), np.mean(trim_data_2)
    d_x, d_y = (n_x - 1) * s_x / (m_x *
                                  (m_x - 1)), (n_y - 1) * s_y / (m_y *
                                                                 (m_y - 1))
    df = pow(d_x + d_y, 2) / (pow(d_x, 2) / (m_x - 1) + pow(d_y, 2) /
                              (m_y - 1))
    t_value = (x_bar - y_bar) / sqrt(d_x + d_y)
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - t.cdf(abs(t_value), df))
    elif alternative.casefold() == 'greater':
        p = 1 - t.cdf(t_value, df)
    else:
        p = t.cdf(t_value, df)
    return t_value, p
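A small usage sketch for yeun_welch_test on synthetic samples with injected outliers; there is no scipy counterpart, so only the local call is shown. Assumes the function and _check_table are in scope and numpy is installed; p=20 trims 10% from each tail of each sample.

import numpy as np

rng = np.random.default_rng(12)
a = np.append(rng.normal(loc=0.0, size=30), [8.0, -9.0])   # outliers inflate the tails
b = np.append(rng.normal(loc=0.7, size=35), [10.0])

print(yeun_welch_test(a, b, p=20))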
Example #10
def g_proportion_test(success_prob, n_total, expected=None):
    """Not found in either statsmodels or scipy.stats
    Used when we are given proportions of success (as well as total participants) instead of
    numbers of success

    Parameters
    ----------
    success_prob: list or numpy array, 1-D
        A list containing the percentage of success for each successive group. Needs to be the same size
        as n_total and expected
    n_total: list or numpy array, 1-D
        A list containing the total count of each successive group. Needs to be the same size as success_prob and
        expected
    expected: (optional) list or numpy array, 1-D
        If None, then expected is the weighted average of success_prob
        Else, a list containing the expected probabilities of each success group. Needs to be the same size as success_prob
        and n_total.

    Returns
    -------
    g: float
        Our measure of the difference between our observed and expected results
    p: float, 0 <= p <= 1
        The likelihood that we would observe these differences if each group was sampled from the same population
    """
    success_prob, n_total = _check_table(
        success_prob, only_count=False), _check_table(n_total, only_count=True)
    if len(success_prob) != len(n_total):
        raise ValueError(
            "Success probability and N Total are not of same length")
    if expected is None:
        expected = np.sum(success_prob * n_total) / np.sum(n_total)
    else:
        expected = _check_table(expected, only_count=False)
        if len(expected) != len(success_prob):
            raise ValueError(
                "Expected and Success probability are not of same length")
        if not np.all(expected < 1):
            raise ValueError(
                "Cannot have percentage of expected greater than 1")
        elif not np.all(expected >= 0):
            raise ValueError("Cannot have negative percentage of expected")
    if not np.all(success_prob < 1):
        raise ValueError("Cannot have percentage of success greater than 1")
    elif not np.all(success_prob >= 0):
        raise ValueError("Cannot have negative percentage of success")
    n_success = success_prob * n_total
    n_failure = n_total - n_success
    n_expected_success = expected * n_total
    n_expected_failure = (1 - expected) * n_total
    df = len(n_total) - 1
    g = 2 * (np.sum(n_success * np.log(n_success / n_expected_success)) +
             np.sum(n_failure * np.log(n_failure / n_expected_failure)))
    p = 1 - chi2.cdf(g, df)
    return g, p
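A usage sketch for g_proportion_test with made-up per-group success rates and sizes; with expected=None the pooled, size-weighted success rate is used as the null value. Assumes the function, _check_table, numpy, and scipy are in scope.

import numpy as np

success_prob = np.array([0.42, 0.55, 0.48])   # observed success rate per group
n_total = np.array([120, 90, 150])            # number of participants per group

print(g_proportion_test(success_prob, n_total))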
def calculate_c(data_1, data_2):
    """Helper that computes the Cucconi test statistic C from the standardized sum of squared ranks (u) and the
    standardized sum of squared contra-ranks (v) of the first sample within the pooled data."""
    data_1, data_2 = _check_table(data_1, only_count=False), _check_table(data_2, only_count=False)
    all_data = np.concatenate([data_1, data_2])
    rank_data = rankdata(all_data)
    n, n_1, n_2 = len(all_data), len(data_1), len(data_2)
    r_1 = rank_data[:n_1]
    u = (6 * np.sum(np.power(r_1, 2)) - n_1 * (n + 1) * (2 * n + 1)) / sqrt(n_1 * n_2 * (n + 1) * (2 * n + 1) * (8 * n + 11) / 5)
    v = (6 * np.sum(np.power(n + 1 - r_1, 2)) - n_1 * (n + 1) * (2 * n + 1)) / sqrt(n_1 * n_2 * (n + 1) * (2 * n + 1) * (8 * n + 11) / 5)
    rho = 2 * (pow(n, 2) - 4) / ((2 * n + 1) * (8 * n + 11)) - 1
    c = (pow(u, 2) + pow(v, 2) - 2 * rho * u * v) / (2 * (1 - pow(rho, 2)))
    return c
Example #12
def trimmed_means_test(data_1, data_2, p=10, alternative='two-sided'):
    """Not found in scipy.stats or statsmodels.
    Used when we wish to perform a two-sample t-test, but suspect that the data is being heavily influenced by outliers,
    i.e., cannot assume normality.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed dataset we are comparing to data_2
    data_2: list or numpy array, 1-D
        The observed dataset we are comparing to data_1
    p: float, 0 <= p <= 100
        The percentage of data we wish to drop from each sample
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Return
    ------
    t_value: number
        The t statistic for the difference between our datasets
    p: float, 0 <= p <= 1
        The likelihood that the observed differences are due to chance
    """
    if p < 0 or p > 100:
        raise ValueError("Percentage trimmed needs to be between 0 and 100")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    sort_data_1, sort_data_2 = np.sort(data_1), np.sort(data_2)
    n_1, n_2 = len(data_1) * p // 200, len(data_2) * p // 200
    trim_data_1, trim_data_2 = sort_data_1[
        n_1:len(sort_data_1) - n_1], sort_data_2[n_2:len(sort_data_2) - n_2]
    n_x, n_y = len(data_1), len(data_2)
    m_x, m_y = len(trim_data_1), len(trim_data_2)
    # Winsorize: replace each trimmed observation with the nearest retained value
    winsor_values_1 = np.append(np.repeat(trim_data_1[0], n_1),
                                np.repeat(trim_data_1[-1], n_1))
    winsor_values_2 = np.append(np.repeat(trim_data_2[0], n_2),
                                np.repeat(trim_data_2[-1], n_2))
    winsor_data_1 = np.append(trim_data_1, winsor_values_1)
    winsor_data_2 = np.append(trim_data_2, winsor_values_2)
    s_x, s_y = np.var(winsor_data_1, ddof=1), np.var(winsor_data_2, ddof=1)
    x_bar, y_bar = np.mean(trim_data_1), np.mean(trim_data_2)
    pooled_var = ((n_x - 1) * s_x + (n_y - 1) * s_y) / ((m_x - 1) + (m_y - 1))
    t_value = (x_bar - y_bar) / np.sqrt(pooled_var * ((1 / m_x) + (1 / m_y)))
    df = m_x + m_y - 2
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - t.cdf(abs(t_value), df))
    elif alternative.casefold() == 'greater':
        p = 1 - t.cdf(t_value, df)
    else:
        p = t.cdf(t_value, df)
    return t_value, p
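A short usage sketch for trimmed_means_test on synthetic data containing a gross outlier; no library counterpart is shown since neither scipy.stats nor statsmodels exposes this test directly. Assumes the function and _check_table are in scope with numpy installed.

import numpy as np

rng = np.random.default_rng(13)
a = np.append(rng.normal(loc=5.0, size=40), [25.0])   # one gross outlier
b = rng.normal(loc=6.0, size=38)

print(trimmed_means_test(a, b, p=10))   # trims 5% from each tail of each sample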
def two_sample_wilcoxon_test(data_1, data_2, alternative='two-sided', handle_zero='wilcox'):
    """This test can be found in scipy.stats as wilcoxon
    Used when we want to compare two related or paired samples, or repeated measurements, and see if their population
    mean ranks differ. Also used when we cannot assume that the samples are normally distributed.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The first sample or repeated measure
    data_2: list or numpy array, 1-D
        The second sample or repeated measure
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis
    handle_zero: str, default is wilcox
        How we treat differences of zero. It can be either wilcox (ignore) or pratt

    Return
    ------
    w_value: float
        The W statistic for our observed differences in mean ranks
    p: float, 0 <= p <= 1
        The likelihood that the observed mean rank differences would be found in two datasets sampled from the same
        population
    """
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    if handle_zero.casefold() not in ['wilcox', 'pratt']:
        raise ValueError("Cannot determine how to handle differences of zero")
    if len(data_1) != len(data_2):
        raise AttributeError("Cannot perform signed wilcoxon test on unpaired data")
    data_1, data_2 = _check_table(data_1, False), _check_table(data_2, False)
    diff = data_1 - data_2
    if handle_zero.casefold() == 'wilcox':
        assert np.sum(diff == 0) != len(data_1), "Cannot perform wilcoxon test when all differences are zero"
        diff = np.compress(np.not_equal(diff, 0), diff)
    n = len(diff)
    abs_diff, sign_diff = np.abs(diff), np.sign(diff)
    rank = rankdata(abs_diff)
    if handle_zero.casefold() == "pratt":
        zero_ranks = np.not_equal(abs_diff, 0)
        sign_diff, rank = np.compress(zero_ranks, sign_diff), np.compress(zero_ranks, rank)
    w_value = np.sum(sign_diff * rank)
    std = sqrt(n * (n + 1) * (2 * n + 1) / 6)
    z_score = w_value / std
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z_score)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z_score)
    else:
        p = norm.cdf(z_score)
    return w_value, p
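A usage sketch for two_sample_wilcoxon_test next to scipy.stats.wilcoxon; note that scipy reports a different statistic convention (the smaller of the rank sums) and may use an exact p-value, so only rough agreement on the p-value is expected. Assumes the function, _check_table, numpy, and scipy are available.

import numpy as np
from scipy.stats import wilcoxon

rng = np.random.default_rng(4)
before = rng.normal(loc=10.0, size=40)
after = before + rng.normal(loc=0.4, scale=1.0, size=40)

print(two_sample_wilcoxon_test(before, after))   # signed rank-sum statistic, normal approximation
print(wilcoxon(before, after))                   # scipy reference (different statistic convention)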
Example #14
def binomial_sign_test(data_1,
                       data_2,
                       alternative='two-sided',
                       success_prob=0.5):
    """Found in scipy as sign_test
    Used to determine whether or not the measured differences between two groups (X and Y) are
    significantly greater and/or less than each other. For instance, we might use this to determine if the weight loss
    for users who followed a certain diet is significant or not.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        A list of all observations for group X.
    data_2: list or numpy array, 1-D
        A list of all observations for group Y.
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis
    success_prob: float, 0 <= success_prob <= 1
        The probability of success. Default is 0.5

    Returns
    -------
    p: float, 0 <= p <= 1
        The probability that our observed differences would happen under a binomial distribution, assuming the given
        success probability.
    """
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    if len(data_1) != len(data_2):
        raise AttributeError("The two data sets are not paired data sets")
    if not isinstance(success_prob, float):
        raise TypeError("Probability of success needs to be a decimal value")
    if success_prob > 1 or success_prob < 0:
        raise ValueError(
            "Cannot calculate probability of success, needs to be between 0 and 1"
        )
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    diff = data_1 - data_2
    pos_diff, neg_diff = np.sum(diff > 0), np.sum(diff < 0)
    total = pos_diff + neg_diff
    if alternative.casefold() == 'greater':
        p = _right_extreme(pos_diff, total, success_prob)
    elif alternative.casefold() == 'less':
        p = _left_extreme(pos_diff, total, success_prob)
    else:
        p = _left_extreme(neg_diff, total, success_prob) + _right_extreme(
            pos_diff, total, success_prob)
    return p
def peirce_test(observed, expected, num_outliers=1, num_coef=1):
    """Not found in either scipy.stats or statsmodels

    Parameters
    ----------
    observed: list or numpy array
        Our observed observations
    expected: list or numpy array
        Our expected observations, or what the model outputted for "Y"
    num_outliers: int, default is 1
        The number of outliers we are trying to identify.
    num_coef: int, default is 1
        The number of regression variables we are thinking of including

    Returns
    -------
    An array containing all values that we found to be outliers according to Peirce's criterion.
    """
    if not isinstance(num_outliers, int):
        raise TypeError("Number of outliers needs to be an integer")
    if num_outliers < 0:
        raise ValueError("Number of outliers has to be a positive value")
    if not isinstance(num_coef, int):
        raise TypeError("Number of regression coefficients needs to be an integer")
    if num_coef < 0:
        raise ValueError("Number of regression coefficients has to be a positive value")
    observed, expected = _check_table(observed), _check_table(expected)
    if len(observed) != len(expected):
        raise ValueError("Length of observed and expected need to be the same")
    n = len(observed)
    if num_outliers > n:
        raise ValueError("Cannot have number of outliers greater than number of observations")
    if num_coef > n:
        raise Warning("Number of regressor variables is greater than number of observations")
    q = pow(num_outliers, num_outliers / n) * pow(n - num_outliers, (n - num_outliers) / n) / n
    r_new, r_old = 1.0, 0.0
    while abs(r_new - r_old) > (n * 2.0e-16):
        ldiv = pow(r_new, num_outliers) if pow(r_new, num_outliers) != 0 else 1.0e-6
        lambda1 = pow(q, n) / pow(ldiv, 1 / (n - num_coef))
        x2 = 1 + (n - num_coef - num_outliers) / (num_outliers * (1.0 - pow(lambda1, 2)))
        if x2 < 0:
            x2 = 0.0
            r_old = r_new
        else:
            r_old = r_new
            r_new = np.exp((x2 - 1) / 2.0) * erfc(np.sqrt(x2 / 2))
    mean_squared_error = np.sum(np.power(observed - expected, 2)) / n
    threshold = x2 * mean_squared_error
    return observed[np.power(observed - expected, 2) > threshold]
def two_sample_mann_whitney_test(data_1, data_2, alternative='two-sided'):
    """This test can be found in scipy.stats as mannwhitneyu
    Used when we want to test whether or not the distribution of two ordinal response variables are equal or not,
    assuming that each sample is independent of one another.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed sample for ordinal response variable 1
    data_2: list or numpy array, 1-D
        The observed sample for ordinal response variable 2
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Return
    ------
    u: float
        The U statistic for our observed differences in the two ordinal responses
    p: float, 0 <= p <= 1
        The likelihood that the observed differences are due to chance
    """
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    data_1, data_2 = _check_table(data_1, False), _check_table(data_2, False)
    combined_data = rankdata(np.concatenate([data_1, data_2]))
    combined_data_len = len(combined_data)
    data_1_len, data_2_len = len(data_1), len(data_2)
    data_1_rank = np.sum(combined_data[:len(data_1)])
    data_2_rank = np.sum(combined_data[len(data_1):])
    u1 = data_1_rank - ((data_1_len * (data_1_len + 1)) / 2)
    u2 = data_2_rank - ((data_2_len * (data_2_len + 1)) / 2)
    u_mean = (u1 + u2) / 2
    if alternative.casefold() == 'two-sided':
        u = np.min([u1, u2])
    elif alternative.casefold() == 'greater':
        u = u1
    else:
        u = u2
    T = np.unique(combined_data, return_counts=True)[1]  # tie-group sizes, taken from the pooled midranks
    sum_T = np.sum(np.power(T, 3) - T) / (combined_data_len * (combined_data_len - 1))
    u_sd = sqrt((data_1_len * data_2_len / 12) * (combined_data_len + 1 - sum_T))
    z_score = (u - u_mean) / u_sd
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z_score)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z_score)
    else:
        p = norm.cdf(z_score)
    return u, p
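A usage sketch for two_sample_mann_whitney_test against scipy.stats.mannwhitneyu; scipy reports U for the first sample and may use an exact or continuity-corrected p-value, so small differences are expected. Assumes the function, _check_table, numpy, and scipy are available.

import numpy as np
from scipy.stats import mannwhitneyu

rng = np.random.default_rng(5)
a = rng.normal(loc=0.0, size=35)
b = rng.normal(loc=0.6, size=40)

print(two_sample_mann_whitney_test(a, b))            # implementation above
print(mannwhitneyu(a, b, alternative='two-sided'))   # scipy reference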
def bowker_test(cont_table):
    """Found in statsmodels as TableSymmetry or as bowker_symmetry
    Used to test if a given square table is symmetric about the main diagonal

    Parameters
    ----------
    cont_table: list or numpy array, n x n
        A nxn contingency table

    Return
    ------
    x: float
        Our Chi statistic, or a measure of symmetry for our contingency table
    p: float, 0 <= p <= 1
        The probability of observing this much asymmetry by chance if the table were truly symmetric
    """
    cont_table = _check_table(cont_table, only_count=True)
    n1, n2 = np.shape(cont_table)
    if n1 != n2:
        raise AttributeError("Contingency Table needs to be of a square shape")
    upper_diagonal = np.triu_indices(n1, 1)
    # lower_diagonal = np.tril_indices(n1, -1) The issue with this code is that it doesn't maintain the exact order
    # of a lower triangular matrix compared to np.triu_indices, which we need for our test statistic
    upper_triangle = cont_table[upper_diagonal]
    lower_triangle = cont_table.T[upper_diagonal]
    x = np.sum(
        np.power(lower_triangle - upper_triangle, 2) /
        (upper_triangle + lower_triangle))
    df = n1 * (n1 - 1) / 2
    p = 1 - chi2.cdf(x, df)
    return x, p
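A small usage sketch for bowker_test on a made-up 3x3 rater-agreement table; assumes the function and _check_table are in scope with numpy and scipy installed.

import numpy as np

# Ratings from two raters on a 3-point scale; the test asks whether disagreements are symmetric
table = np.array([[20, 5, 3],
                  [6, 18, 4],
                  [2, 5, 15]])

print(bowker_test(table))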
Example #18
def brown_forsythe_test(*args):
    """Found in scipy.stats as levene(center='median')
    Used instead of general levene test if we believe our data to be non-normal.

    Parameters
    ----------
    args: list or numpy arrays, 1-D
        The observed variable/observations for each group, organized into lists or numpy array

    Return
    ------
    w: float
        The W statistic, our measure of difference in variability, which is approximately F-distributed.
    p: float, 0 <= p <= 1
        The probability that our observed differences in variances could occur due to random sampling from a population
        of equal variance.
    """
    k = len(args)
    if k < 2:
        raise AttributeError(
            "Need at least two groups to perform a Brown-Forsythe Test")
    n_i, z_bar, all_z_ij, z_bar_condensed = [], [], [], []
    for obs in args:
        obs = _check_table(obs, False)
        n_i = np.append(n_i, len(obs))
        z_ij = abs(obs - np.median(obs))
        all_z_ij = np.append(all_z_ij, z_ij)
        z_bar = np.append(z_bar, np.repeat(np.mean(z_ij), len(obs)))
        z_bar_condensed = np.append(z_bar_condensed, np.mean(z_ij))
    scalar = (np.sum(n_i) - k) / (k - 1)
    w = scalar * np.sum(
        n_i * np.power(z_bar_condensed - np.mean(z_bar), 2)) / np.sum(
            np.power(all_z_ij - z_bar, 2))
    p = 1 - f.cdf(w, k - 1, np.sum(n_i) - k)
    return w, p
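A usage sketch for brown_forsythe_test, compared with scipy.stats.levene(center='median') as named in the docstring; assumes the function, _check_table, numpy, and scipy are available.

import numpy as np
from scipy.stats import levene

rng = np.random.default_rng(6)
g1 = rng.normal(scale=1.0, size=30)
g2 = rng.normal(scale=1.5, size=30)
g3 = rng.normal(scale=2.0, size=30)

print(brown_forsythe_test(g1, g2, g3))       # implementation above
print(levene(g1, g2, g3, center='median'))   # scipy reference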
def thompson_tau_test(data, alpha=0.05):
    """Not found in either scipy.stats or statsmddels.
    Uses the Thompson-Tau criteria to iteratively identify outliers until no more exist.

    Parameters
    ----------
    data: list or numpy array, 1-D
        Our dataset we are evaluating for outliers
    alpha: float, default is 0.05
        Our level of significance for detecting outliers

    Returns
    -------
    outliers_list: list
        A list containing all datapoints that we found to be an outlier by Thompson-Tau's criteria
    """
    data = _check_table(data, only_count=False)
    if alpha < 0 or alpha > 1:
        raise ValueError("Cannot have alpha level greater than 1 or less than 0")
    outlier_exist, outlier_table = True, []
    data_copy = np.copy(data)
    while outlier_exist:
        n, mu, s = len(data_copy), np.mean(data_copy), np.std(data_copy, ddof=1)
        ab_resid = np.abs(data_copy - mu) / s
        rejection = t.isf(alpha / 2, n - 2) * (n - 1) / (sqrt(n) * sqrt(n - 2 + pow(t.isf(alpha / 2, n - 2), 2)))
        is_outlier = ab_resid > rejection
        if np.sum(is_outlier) != 0:
            outlier_table.append(data_copy[np.argsort(ab_resid)][-1:][0])
            data_copy = data_copy[np.argsort(ab_resid)][:-1]
        else:
            outlier_exist = False
    return outlier_table
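A quick usage sketch for thompson_tau_test on a made-up sample with one obvious outlier; assumes the function, _check_table, numpy, and scipy's t distribution are in scope.

import numpy as np

data = np.array([9.8, 10.1, 10.0, 9.9, 10.2, 10.0, 9.7, 30.0])   # 30.0 is a clear outlier
print(thompson_tau_test(data))   # expected to return [30.0]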
Example #20
def bartlett_test(*args):
    """Found in scipy.stats as bartlett
    This test is used to determine if multiple samples are from a population of equal variances. Note that this test
    is much more sensitive to data that is non-normal compared to Levene or Brown-Forsythe.

    Parameters
    ----------
    args: list or numpy arrays, 1-D
        The observed measurements for each group, organized into lists or numpy array

    Return
    ------
    X: float
        The Chi statistic, or a measure of the observed difference in variances
    p: float, 0 <= p <= 1
        The probability that our observed differences in variances could occur due to random sampling from a population
        of equal variance.
    """
    k = len(args)
    if k < 2:
        raise AttributeError(
            "Need at least two groups to perform the Bartlett Test")
    n_i, var_i = [], []
    for obs in args:
        obs = _check_table(obs)
        n_i = np.append(n_i, len(obs))
        var_i = np.append(var_i, np.var(obs, ddof=1))
    pooled_variance = np.sum((n_i - 1) * var_i) / (np.sum(n_i) - k)
    top = (np.sum(n_i) - k) * np.log(pooled_variance) - np.sum(
        (n_i - 1) * np.log(var_i))
    bottom = 1 + (1 / (3 * (k - 1))) * (np.sum(1 / (n_i - 1)) -
                                        (1 / (np.sum(n_i) - k)))
    X = top / bottom
    p = 1 - chi2.cdf(X, k - 1)
    return X, p
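A usage sketch for bartlett_test against scipy.stats.bartlett; assumes the function, _check_table, numpy, and scipy are available.

import numpy as np
from scipy.stats import bartlett

rng = np.random.default_rng(7)
g1 = rng.normal(scale=1.0, size=25)
g2 = rng.normal(scale=1.2, size=30)
g3 = rng.normal(scale=2.5, size=20)

print(bartlett_test(g1, g2, g3))   # implementation above
print(bartlett(g1, g2, g3))        # scipy reference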
def skew_test(data):
    """Found in scipy.stats as skewtest.
    Used to determine the likelihood that our sample dataset comes from a normal distribution based on its skewness.

    Parameters
    ----------
    data: list or numpy array, 1-D
        Contains all observations from our sample to measure departure from normality

    Returns
    -------
    z: float
        Our test statistic, or the measure of difference of our skewness compared to a normal distribution
    p: float, 0 <= p <= 1
        The likelihood that we would see the observed differences in skewness from a normal population due
        to chance
    """
    data = _check_table(data, only_count=False)
    if len(data) < 8:
        raise AttributeError("Skew Test is not reliable on datasets with less than 8 observations")
    n = len(data)
    skew = _skew(data)
    y2 = (36 * (n - 7) * (pow(n, 2) + 2 * n - 5)) / ((n - 2) * (n + 5) * (n + 7) * (n + 9))
    u2 = 6 * (n - 2) / ((n + 1) * (n + 3))
    w2 = sqrt(2 * y2 + 4) - 1
    delta = 1 / sqrt(log(sqrt(w2)))
    alpha_2 = 2 / (w2 - 1)
    z = delta * asinh(skew / sqrt(alpha_2 * u2))
    p = 2 * (1 - norm.cdf(abs(z)))
    return z, p
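A usage sketch for skew_test next to scipy.stats.skewtest; how closely the two agree depends on how the module's private _skew helper is defined (scipy uses the biased sample skewness). Assumes the function, its helpers, numpy, and scipy are available.

import numpy as np
from scipy.stats import skewtest

rng = np.random.default_rng(8)
sample = rng.exponential(size=60)   # clearly right-skewed

print(skew_test(sample))   # implementation above
print(skewtest(sample))    # scipy reference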
def kurtosis_test(data):
    """Found in scipy.stats as kurtosistest.
    Used to determine the likelihood that our sample dataset comes from a normal distribution based on its kurtosis.

    Parameters
    ----------
    data: list or numpy array, 1-D
        Contains all observations from our sample to measure departure from normality

    Returns
    -------
    z: float
        Our test statistic, or the measure of difference of our kurtosis compared to a normal distribution
    p: float, 0 <= p <= 1
        The likelihood that we would see the observed differences in kurtosis from a normal population due
        to chance
    """
    data = _check_table(data, only_count=False)
    if len(data) < 20:
        raise AttributeError("Kurtosis Test is not reliable on datasets with less than 20 observations")
    n = len(data)
    kurtosis = _kurtosis(data) - 3
    mean_kurt = - 6 / (n + 1)
    var_kurt = 24 * n * (n - 2) * (n - 3) / (pow(n + 1, 2) * (n + 3) * (n + 5))
    skew_kurt = (6 * (pow(n, 2) - 5 * n + 2) / ((n + 7) * (n + 9))) * sqrt(6 * (n + 3) * (n + 5) / (n * (n - 2) * (n - 3)))
    a = 6 + ((8 / skew_kurt) * (2 / skew_kurt + sqrt(1 + 4 / pow(skew_kurt, 2))))
    z_top = 1 - 2 / a
    z_bottom = 1 + ((kurtosis - mean_kurt) / sqrt(var_kurt)) * sqrt(2 / (a - 4))
    z = sqrt(9 * a / 2) * (1 - 2 / (9 * a) - np.sign(z_bottom) * np.power(z_top / abs(z_bottom), 1 / 3.0))
    p = 2 * (1 - norm.cdf(abs(z)))
    return z, p
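A usage sketch for kurtosis_test next to scipy.stats.kurtosistest; agreement depends on the module's private _kurtosis helper matching scipy's (biased) sample kurtosis. Assumes the function, its helpers, numpy, and scipy are available.

import numpy as np
from scipy.stats import kurtosistest

rng = np.random.default_rng(9)
sample = rng.standard_t(df=5, size=100)   # heavier tails than a normal distribution

print(kurtosis_test(sample))   # implementation above
print(kurtosistest(sample))    # scipy reference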
def box_pierce_test(data, num_lags=None):
    """Found in statsmodels as acorr_ljung(boxpierce=True)
    Used to determine if any group of autocorrelations in a time series dataset are different from zero

    Parameters
    ----------
    data: list or numpy array, 1-D
        The time series dataset we are performing our test on
    num_lags: int or list, default is None
        If int, the maximum number of time lags
        If list, then the series of time lags we are performing
        If None, then use np.arange(1, 11)

    Returns
    -------
    q: float
        The Box-Pierce statistic, or our measure of autocorrelations differing from zero
    p: float, 0 <= p <= 1
        The likelihood that our observed autocorrelations would differ from zero due to chance
    """
    if num_lags is None:
        h_lags = np.arange(1, 11)
    elif isinstance(num_lags, int):
        h_lags = np.arange(1, num_lags + 1)
    elif isinstance(num_lags, list) or isinstance(num_lags, (np.ndarray, np.generic)):
        h_lags = _check_table(num_lags, only_count=False)
    else:
        raise ValueError("Cannot discern number of lags")
    h = np.max(h_lags)
    n = len(data)
    q = n * np.sum(pow(_autocorr(data, h_lags), 2))
    p = 1 - chi2.cdf(q, h)
    return q, p
def mcnemar_test(cont_table):
    """Found in statsmodels as mcnemar
    Used when we have paired nominal data that is organized in a 2x2 contingency table. It is used to test the
    assumption that the marginal column and row probabilities are equal, i.e., that the probabilities of landing in
    cells b and c are equivalent.

    Parameters
    ----------
    cont_table: list or numpy array, 2 x 2
        A 2x2 contingency table

    Return
    ------
    chi_squared: float
        Our Chi statistic, or the sum of differences between b and c
    p: float, 0 <= p <= 1
        The probability of observing this large a difference between b and c by chance if they were truly equivalent
    """
    cont_table = _check_table(cont_table, True)
    if cont_table.shape != (2, 2):
        raise AttributeError(
            "McNemar's Test is meant for a 2x2 contingency table")
    b, c = cont_table[0, 1], cont_table[1, 0]
    if b + c > 25:
        chi_squared = pow(abs(b - c) - 1, 2) / (b + c)
        p = 1 - chi2.cdf(chi_squared, 1)
    else:
        chi_squared = min(b, c)
        p = 2 * binom.cdf(chi_squared, b + c, 0.5) - binom.pmf(
            binom.ppf(0.99, b + c, 0.5), b + c, 0.5)
    return chi_squared, p
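A small usage sketch for mcnemar_test on a made-up 2x2 table of paired outcomes; here b + c = 28 > 25, so the continuity-corrected chi-squared branch is used. Assumes the function and _check_table are in scope with numpy and scipy installed.

import numpy as np

# Paired before/after outcomes; the discordant cells are b = 18 and c = 10
table = np.array([[40, 18],
                  [10, 32]])

print(mcnemar_test(table))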
def chi_squared_test(cont_table):
    """Found in scipy.stats as chi2_contingency.
    Determines the difference between what we expect the count of a group to be versus what was observed in our
    contingency table. Assuming the observations are independent (so that the test statistic follows a chi-squared
    distribution), if the observed differences are found to be very high given the number of observations, then we
    reject our null hypothesis and conclude that this difference is unlikely to have occurred due to chance.

    Parameters
    ----------
    cont_table: list or numpy array, 2 x 2
        A contingency table containing 2 counts of 2, or 4 counts total. As an example of expected output, refer to a
        confusion matrix for predicting a binary variable.

    Return
    ------
    X: float
        The Chi test statistic, or the variance of the difference of our observed results versus expected results.
    p: float, 0 <= p <= 1
        The likelihood that we would observe our X value given the number of observations we had.
    """
    cont_table = _check_table(cont_table, only_count=True)
    df = (cont_table.shape[0] - 1) * (cont_table.shape[1] - 1)
    row_sum, col_sum = np.sum(cont_table, axis=1), np.sum(cont_table, axis=0)
    expected = np.matmul(np.transpose(row_sum[np.newaxis]),
                         col_sum[np.newaxis]) / np.sum(row_sum)
    X = np.sum(pow(cont_table - expected, 2) / expected)
    p = 1 - chi2.cdf(X, df)
    return X, p
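A usage sketch for chi_squared_test against scipy.stats.chi2_contingency; correction=False is passed because the implementation above does not apply Yates' continuity correction. Assumes the function, _check_table, numpy, and scipy are available.

import numpy as np
from scipy.stats import chi2_contingency

table = np.array([[30, 10],
                  [20, 25]])

print(chi_squared_test(table))                     # implementation above
print(chi2_contingency(table, correction=False))   # scipy reference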
def g_test(cont_table):
    """Found in scipy.stats as chi2_contingency(lambda_="log-likelihood")
    A likelihood ratio test used to determine whether the differences between our observed results and expected results
    in our contingency table are likely to have happened due to chance.

    Parameters
    ----------
    cont_table: list or numpy array, 2 x 2
        A contingency table containing 2 counts of 2, or 4 counts total. As an example of expected output, refer to a
        confusion matrix for predicting a binary variable.

    Return
    ------
    g: float
        The G statistic, or the likelihood ratio of the difference between observed and expected
    p: float, 0 <= p <= 1
        The likelihood that our observed differences are due to chance
    """
    cont_table = _check_table(cont_table, True)
    df = (cont_table.shape[0] - 1) * (cont_table.shape[1] - 1)
    row_sum, col_sum = np.sum(cont_table, axis=1), np.sum(cont_table, axis=0)
    expected = np.matmul(np.transpose(row_sum[np.newaxis]),
                         col_sum[np.newaxis]) / np.sum(row_sum)
    g = 2 * np.sum(cont_table * np.log(cont_table / expected))
    p = 1 - chi2.cdf(g, df)
    return g, p
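A usage sketch for g_test against scipy.stats.chi2_contingency(lambda_="log-likelihood") as named in the docstring, again with correction=False since no continuity correction is applied above. Assumes the function, _check_table, numpy, and scipy are available.

import numpy as np
from scipy.stats import chi2_contingency

table = np.array([[30, 10],
                  [20, 25]])

print(g_test(table))                                                          # implementation above
print(chi2_contingency(table, correction=False, lambda_="log-likelihood"))    # scipy reference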
Example #27
def fligner_policello_test(data_1, data_2, alternative='two-sided'):
    """Not found in either scipy.stats or statsmodels.
    Used to determine whether the population medians corresponding to two independent samples are equal.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed measurements for our first sample
    data_2: list or numpy array, 1-D
        The observed measurements for our second sample
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Returns
    -------
    z: float
        The z-score of our observed median differences
    p: float, 0 <= p <= 1
        The likelihood that we would observe these differences due to chance
    """
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine alternative hypothesis")
    m, n = len(data_1), len(data_2)
    if m < 12 or n < 12:
        warnings.warn(
            "Datasets may be too small for accurate approximation of p")

    def compare_points(x, y):
        z = x - y[:, None]
        z = np.where(z > 0, 1, z)
        z = np.where(z == 0, 0.5, z)
        z = np.where(z < 0, 0, z)
        return np.sum(z, axis=0)

    n_x, n_y = compare_points(data_1, data_2), compare_points(data_2, data_1)
    Nx, Ny = np.sum(n_x), np.sum(n_y)
    m_x, m_y = np.mean(n_x), np.mean(n_y)
    ss_x, ss_y = np.sum(np.power(n_x - m_x, 2)), np.sum(np.power(n_y - m_y, 2))
    z = (Ny - Nx) / (2 * np.sqrt(ss_x + ss_y - (m_x * m_y)))
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z)
    else:
        p = norm.cdf(z)
    return z, p
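A short usage sketch for fligner_policello_test on synthetic samples with different medians and spreads; both samples have at least 12 observations, so the small-sample warning above is not triggered. Assumes the function, _check_table, numpy, and scipy's norm are in scope.

import numpy as np

rng = np.random.default_rng(14)
a = rng.normal(loc=0.0, scale=1.0, size=25)
b = rng.normal(loc=0.8, scale=2.0, size=30)

print(fligner_policello_test(a, b))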
Example #28
def two_sample_proportion_z_test(data_1, data_2, alternative='two-sided'):
    """Found in statsmodels as proportions_ztest
    Used when we are comparing whether or not two sample proportions are the same, given that both of them can be
    approximated by a normal distribution.

    Parameters
    ----------
    data_1: list or numpy array, must be binary, 1-D
        An array containing all observations, marked as a 0 for failure and a 1 for success, that we are comparing to
        data_2
    data_2: list or numpy array, must be binary, 1-D
        An array containing all observations, marked as a 0 for failure and a 1 for success, that we are comparing to
        data_1
    alternative: str, default is two-sided
        Our alternative hypothesis. It can be two-sided, less or greater

    Return
    ------
    z_score: float
        Our z-statistic to analyze the likelihood that our observed difference is due to chance
    p: float, 0 <= p <= 1
        The probability that the differences between two samples, assuming a normal distribution, is due to chance
    """
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    if not np.array_equal(data_1, data_1.astype(bool)):
        raise AttributeError(
            "Cannot perform a proportion test on non-binary data for data_1")
    if not np.array_equal(data_2, data_2.astype(bool)):
        raise AttributeError(
            "Cannot perform a proportion test on non-binary data for data_2")
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    n_1, n_2 = len(data_1), len(data_2)
    p_1, p_2 = np.mean(data_1), np.mean(data_2)
    p = (p_1 * n_1 + p_2 * n_2) / (n_1 + n_2)
    q = 1 - p
    se = sqrt((p * q) * ((1 / n_1) + (1 / n_2)))
    z_score = (p_1 - p_2) / se
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z_score)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z_score)
    else:
        p = norm.cdf(z_score)
    return z_score, p
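A small usage sketch for two_sample_proportion_z_test on synthetic 0/1 outcomes for two groups; assumes the function, _check_table, numpy, and scipy's norm are in scope. The statsmodels proportions_ztest named in the docstring could serve as a cross-check if it is installed.

import numpy as np

rng = np.random.default_rng(10)
group_a = rng.binomial(1, 0.40, size=120)   # 0/1 outcomes for group A
group_b = rng.binomial(1, 0.55, size=100)   # 0/1 outcomes for group B

print(two_sample_proportion_z_test(group_a, group_b))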
Example #29
def trinomial_test(data_1, data_2, alternative='two-sided'):
    """Not found in scipy.stats or statsmodels
    Used on paired data when the sign test loses power, that is, when there exist instances of "zero observations" or
    differences of zero between the paired data.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed measurements for our first sample
    data_2: list or numpy array, 1-D
        The observed measurements for our second sample
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Returns
    -------
    d: int
        The number of positive instances minus the number of negative instances
    p: float, 0 <= p <= 1
        The likelihood that we would observe these sign differences due to random chance
    """
    data_1, data_2 = _check_table(data_1), _check_table(data_2)
    if len(data_1) != len(data_2):
        raise AttributeError("Cannot perform Trinomial Test on unpaired data")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine alternative hypothesis")
    n = len(data_1)
    diffs = data_1 - data_2
    pos_diff, neg_diff, zero_diff = np.sum(diffs > 0), np.sum(
        diffs < 0), np.sum(diffs == 0)
    p_0 = zero_diff / n
    probs = []

    def calculate_probs(n, z, k, p_0):
        return np.sum(factorial(n) / (st_factorial(n - z - 2 * k) * st_factorial(k + z) * st_factorial(k)) * \
                      np.power(p_0, n - z - (2 * k)) * np.power((1 - p_0) / 2, z + 2 * k))

    for z in range(n + 1):
        k = np.arange(0, (n - z) // 2 + 1)
        probs.append(calculate_probs(n, z, k, p_0))
    d = pos_diff - neg_diff
    if alternative.casefold() == "two-sided":
        p = np.sum(probs[abs(d):]) * 2
    elif alternative.casefold() == 'greater':
        p = np.sum(probs[abs(d):])
    else:
        p = np.sum(probs[:abs(d)])
    return d, p
def mood_test(data_1, data_2, alternative='two-sided'):
    """Found in scipy.stats as mood
    Used to measure the level of dispersion (difference from median) of the ranks of the two datasets.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        A list or array containing all observations from our first dataset
    data_2: list or numpy array, 1-D
        A list or array containing all observations from our second dataset
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Returns
    -------
    z: float
        Our test statistic that measures the degree of normality of the rank dispersions
    p: float, 0 <= p <= 1
        The likelihood that our rank dispersion would occur from two datasets drawn from the same
        distribution
    """
    data_1, data_2 = _check_table(data_1, only_count=False), _check_table(data_2, only_count=False)
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    len_1, len_2 = len(data_1), len(data_2)
    n_obs = len_1 + len_2
    if n_obs < 3:
        raise AttributeError("Not enough observations to perform mood dispertion test")
    all_data = np.concatenate([data_1, data_2])
    rank_data = rankdata(all_data)
    r_1 = rank_data[:len_1]
    m = np.sum(np.power(r_1 - (n_obs + 1) / 2, 2))
    mu_m = len_1 * (pow(n_obs, 2) - 1) / 12
    var_m = len_1 * len_2 * (n_obs + 1) * (n_obs + 2) * (n_obs - 2) / 180
    z = (m - mu_m) / sqrt(var_m)
    if alternative.casefold() == 'two-sided':
        if z > 0:
            p = 2 * (1 - norm.cdf(z))
        else:
            p = 2 * norm.cdf(z)
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z)
    else:
        p = norm.cdf(z)
    return z, p
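A usage sketch for mood_test against scipy.stats.mood, which implements the same dispersion test and defaults to a two-sided alternative; assumes the function, _check_table, numpy, and scipy are available.

import numpy as np
from scipy.stats import mood

rng = np.random.default_rng(11)
a = rng.normal(scale=1.0, size=40)
b = rng.normal(scale=2.0, size=45)

print(mood_test(a, b))   # implementation above
print(mood(a, b))        # scipy reference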