Example #1
import numpy as np
from math import sqrt
from numbers import Number
from scipy.stats import norm, t

# _check_table and _standard_error are validation/helper functions assumed to
# live in this package's utils module (see the reconstruction after Example #11).
from utils import _check_table, _standard_error


def two_sample_t_test(data_1, data_2, alternative='two-sided', paired=False):
    """This test can be found in scipy.stats as either ttest_rel or ttest_ind
    Used when we want to compare the means of two samples that we assume follow a normal distribution, but whose
    sample sizes are too small for a z-test to be reliable.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed dataset we are comparing to data_2
    data_2: list or numpy array, 1-D
        The observed dataset we are comparing to data_1
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis
    paired: bool, default is False
        Whether or not data_1 and data_2 are paired observations

    Return
    ------
    t_value: number
        The t statistic for the difference between our datasets
    p: float, 0 <= p <= 1
        The likelihood that the observed differences are due to chance
    """
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    data_1, data_2 = _check_table(data_1, False), _check_table(data_2, False)
    data_1_mean, data_2_mean = np.mean(data_1), np.mean(data_2)
    if paired:
        # Paired case: this test can be found in scipy.stats as ttest_rel
        if len(data_1) != len(data_2):
            raise AttributeError("Paired data must contain the same number of observations in each sample")
        n = len(data_1)
        df = n - 1
        squared_difference = np.sum(np.power(data_1 - data_2, 2))
        difference = np.sum(data_1 - data_2)
        std = sqrt((squared_difference - (np.power(difference, 2) / n)) / df)
        standard_error_difference = _standard_error(std, n)

    else:
        # Unpaired case: we perform the Welch t-test, which does not assume equal variances;
        # this test can be found in scipy.stats as ttest_ind (with equal_var=False)
        data_1_var, data_2_var = np.var(data_1, ddof=1), np.var(data_2, ddof=1)
        data_1_n, data_2_n = len(data_1), len(data_2)
        # Welch-Satterthwaite approximation for the degrees of freedom
        df = np.power((data_1_var / data_1_n) + (data_2_var / data_2_n), 2) /\
             ((np.power(data_1_var / data_1_n, 2) / (data_1_n - 1)) +
              (np.power(data_2_var / data_2_n, 2) / (data_2_n - 1)))
        standard_error_difference = sqrt((data_1_var / data_1_n) +
                                         (data_2_var / data_2_n))
    t_value = (data_1_mean - data_2_mean) / standard_error_difference
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - t.cdf(abs(t_value), df))
    elif alternative.casefold() == 'greater':
        p = 1 - t.cdf(t_value, df)
    else:
        p = t.cdf(t_value, df)
    return t_value, p
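
A minimal usage sketch. The data values are made up for illustration, and we assume _check_table accepts plain Python lists and returns numpy arrays:

# Hypothetical paired measurements (before/after) for six subjects
before = [12.1, 11.8, 13.2, 12.9, 12.4, 11.5]
after = [11.2, 11.0, 12.1, 12.5, 11.9, 10.8]
t_val, p_val = two_sample_t_test(before, after, alternative='greater', paired=True)
print("t = {:.3f}, p = {:.4f}".format(t_val, p_val))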
Example #2
def two_sample_z_test(data_1, data_2, alternative='two-sided'):
    """This test can be found in statsmodels as ztest_ind
    Determines the likelihood that the means of two samples differ significantly, assuming that both samples are
    drawn from a normal distribution.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed dataset we are comparing to data_2
    data_2: list or numpy array, 1-D
        The observed dataset we are comparing to data_1
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Return
    ------
    z_score: number
        The Z-score of our observed differences
    p: float, 0 <= p <= 1
        The likelihood that the observed differences from data_1 to data_2 are due to chance
    """
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    if len(data_1) < 30 or len(data_2) < 30:
        raise AttributeError(
            "Too few observations for z-test to be reliable, use t-test instead"
        )
    data_1, data_2 = _check_table(data_1, False), _check_table(data_2, False)
    data_1_mean, data_2_mean = np.mean(data_1), np.mean(data_2)
    data_1_std, data_2_std = np.std(data_1, ddof=1), np.std(data_2, ddof=1)
    # Standard error of the difference in means: sqrt(s1^2/n1 + s2^2/n2)
    z_score = (data_1_mean - data_2_mean) / sqrt(
        np.power(_standard_error(data_1_std, len(data_1)), 2) +
        np.power(_standard_error(data_2_std, len(data_2)), 2))
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z_score)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z_score)
    else:
        p = norm.cdf(z_score)
    return z_score, p
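
An illustrative call, using simulated data so the sample-size requirement (n >= 30) is met; the distribution parameters are arbitrary:

rng = np.random.default_rng(42)
group_a = rng.normal(loc=5.0, scale=1.0, size=50)
group_b = rng.normal(loc=5.4, scale=1.2, size=60)
z, p = two_sample_z_test(group_a, group_b, alternative='two-sided')
print("z = {:.3f}, p = {:.4f}".format(z, p))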
Example #3
def one_sample_z_test(sample_data, pop_mean, alternative='two-sided'):
    """This test can be found in statsmodels as ztest
    Determines the likelihood that our sample mean differs from our population mean, assuming that the data follows a
    normal distribution.

    Parameters
    ----------
    sample_data: list or numpy array, 1-D
        Our observational data
    pop_mean: float
        The mean of our population, or what we expect the mean of our sample data to be
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Return
    ------
    z_score: float
        The Z-score of our data
    p: float, 0 <= p <= 1
        The likelihood that the observed difference between our sample mean and the population mean is due to
        chance, assuming a normal distribution
    """
    if not isinstance(pop_mean, Number):
        raise TypeError("Population mean is not of numeric type")
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    if len(sample_data) < 30:
        raise AttributeError(
            "Too few observations for z-test to be reliable, use t-test instead"
        )
    sample_data = _check_table(sample_data, False)
    sample_mean = np.mean(sample_data)
    sample_std = np.std(sample_data, ddof=1)
    z_score = (sample_mean - pop_mean) / _standard_error(sample_std,
                                                         len(sample_data))
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z_score)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z_score)
    else:
        p = norm.cdf(z_score)
    return z_score, p
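
A sketch of a one-sample call, again with simulated data: we test whether a sample of 40 IQ-like scores differs from a population mean of 100 (the numbers are arbitrary):

rng = np.random.default_rng(7)
scores = rng.normal(loc=103.0, scale=15.0, size=40)
z, p = one_sample_z_test(scores, pop_mean=100.0, alternative='greater')
print("z = {:.3f}, p = {:.4f}".format(z, p))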
Example #4
 def test_standardError_nZero_Error(self):
     s, n = 10, 0
     with pytest.raises(ValueError):
         utils._standard_error(s, n)
Example #5
 def test_standardError_nFloat_Error(self):
     s, n = 10, 1.5
     with pytest.raises(TypeError):
         utils._standard_error(s, n)
Example #6
 def test_standardError_Dict2_Error(self):
     s, n = 10, {'n': 10}
     with pytest.raises(TypeError):
         utils._standard_error(s, n)
Example #7
 def test_standardError_List2_Error(self):
     s, n = 10, ['n']
     with pytest.raises(TypeError):
         utils._standard_error(s, n)
Example #8
 def test_standardError_Dict1_Error(self):
     s, n = {'s': 10}, 10
     with pytest.raises(TypeError):
         utils._standard_error(s, n)
Example #9
 def test_standardError_List1_Error(self):
     s, n = ['s'], 10
     with pytest.raises(TypeError):
         utils._standard_error(s, n)
Example #10
 def test_standardError_Result(self):
     s, n = 10, 100
     assert pytest.approx(1.0, 0.01) == utils._standard_error(s, n)
Example #11
 def test_standardError_String1_Error(self):
     s, n = 's', 10
     with pytest.raises(TypeError):
         utils._standard_error(s, n)
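
The tests in Examples #4 through #11 pin down the expected behavior of utils._standard_error: a TypeError for non-numeric s or non-integer n, a ValueError for non-positive n, and s / sqrt(n) otherwise (Example #10: 10 / sqrt(100) == 1.0). A minimal reconstruction consistent with those tests follows; this is a sketch, not the library's actual source:

from math import sqrt
from numbers import Number

def _standard_error(s, n):
    """Return the standard error s / sqrt(n), validating the inputs."""
    # Reject lists, dicts, strings, and (defensively) bools for s
    if not isinstance(s, Number) or isinstance(s, bool):
        raise TypeError("s is not of numeric type")
    # n must be an integer sample size, not a float or container
    if not isinstance(n, int) or isinstance(n, bool):
        raise TypeError("n is not of integer type")
    if n <= 0:
        raise ValueError("n must be a positive integer")
    return s / sqrt(n)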