def two_sample_t_test(data_1, data_2, alternative='two-sided', paired=False):
    """This test can be found in scipy.stats as either ttest_rel or ttest_ind

    Used when we want to compare the distributions of two samples that we assume both follow a normal
    distribution, but whose sample sizes are too small for a z-test to be reliable.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed dataset we are comparing to data_2
    data_2: list or numpy array, 1-D
        The observed dataset we are comparing to data_1
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis
    paired: bool, default is False
        Whether or not data_1 and data_2 are paired observations

    Return
    ------
    t_value: number
        The t statistic for the difference between our datasets
    p: float, 0 <= p <= 1
        The likelihood that the observed differences are due to chance
    """
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    data_1, data_2 = _check_table(data_1, False), _check_table(data_2, False)
    data_1_mean, data_2_mean = np.mean(data_1), np.mean(data_2)
    if paired:
        # Paired (dependent) t-test; this test can be found in scipy.stats as ttest_rel
        if len(data_1) != len(data_2):
            raise AttributeError("The two datasets are not of equal length and cannot be paired")
        n = len(data_1)
        df = n - 1
        squared_difference = np.sum(np.power(data_1 - data_2, 2))
        difference = np.sum(data_1 - data_2)
        std = sqrt((squared_difference - (np.power(difference, 2) / n)) / df)
        standard_error_difference = _standard_error(std, n)
    else:
        # Unpaired case: we perform Welch's t-test because we do not assume equal variances.
        # This test can be found in scipy.stats as ttest_ind (with equal_var=False)
        data_1_var, data_2_var = np.var(data_1, ddof=1), np.var(data_2, ddof=1)
        data_1_n, data_2_n = len(data_1), len(data_2)
        # Welch-Satterthwaite approximation of the degrees of freedom
        df = np.power((data_1_var / data_1_n) + (data_2_var / data_2_n), 2) / \
            ((np.power(data_1_var, 2) / (np.power(data_1_n, 2) * (data_1_n - 1))) +
             (np.power(data_2_var, 2) / (np.power(data_2_n, 2) * (data_2_n - 1))))
        standard_error_difference = sqrt((data_1_var / data_1_n) + (data_2_var / data_2_n))
    t_value = (data_1_mean - data_2_mean) / standard_error_difference
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - t.cdf(abs(t_value), df))
    elif alternative.casefold() == 'greater':
        p = 1 - t.cdf(t_value, df)
    else:
        p = t.cdf(t_value, df)
    return t_value, p
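# A minimal usage sketch (not part of the library): it assumes numpy and scipy are installed and that
# two_sample_t_test is importable from this module. The sample values below are made up for illustration,
# and scipy.stats.ttest_ind with equal_var=False is used only as a cross-check for the unpaired (Welch) case.
import numpy as np
from scipy import stats

group_a = np.array([12.1, 11.8, 13.0, 12.6, 12.9, 11.5])
group_b = np.array([13.4, 13.9, 12.8, 14.1, 13.6, 13.2])

t_value, p = two_sample_t_test(group_a, group_b, alternative='two-sided', paired=False)
print(t_value, p)

# Cross-check against scipy's Welch t-test
print(stats.ttest_ind(group_a, group_b, equal_var=False))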
def two_sample_z_test(data_1, data_2, alternative='two-sided'):
    """This test can be found in statsmodels as ztest_ind

    Determines the likelihood that the distributions of two samples are significantly different, assuming that
    both samples are drawn from a normal distribution.

    Parameters
    ----------
    data_1: list or numpy array, 1-D
        The observed dataset we are comparing to data_2
    data_2: list or numpy array, 1-D
        The observed dataset we are comparing to data_1
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Return
    ------
    z_score: number
        The Z-score of our observed differences
    p: float, 0 <= p <= 1
        The likelihood that the observed differences from data_1 to data_2 are due to chance
    """
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    if len(data_1) < 30 or len(data_2) < 30:
        raise AttributeError("Too few observations for z-test to be reliable, use t-test instead")
    data_1, data_2 = _check_table(data_1, False), _check_table(data_2, False)
    data_1_mean, data_2_mean = np.mean(data_1), np.mean(data_2)
    data_1_std, data_2_std = np.std(data_1, ddof=1), np.std(data_2, ddof=1)
    # Standard error of the difference in means: sqrt(s1^2 / n1 + s2^2 / n2)
    standard_error_difference = sqrt(np.power(_standard_error(data_1_std, len(data_1)), 2) +
                                     np.power(_standard_error(data_2_std, len(data_2)), 2))
    z_score = (data_1_mean - data_2_mean) / standard_error_difference
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z_score)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z_score)
    else:
        p = norm.cdf(z_score)
    return z_score, p
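# A minimal usage sketch (not part of the library): it assumes numpy and statsmodels are installed and that
# two_sample_z_test is importable from this module. The simulated samples are illustrative only, and
# statsmodels.stats.weightstats.ztest is used purely as a cross-check.
import numpy as np
from statsmodels.stats.weightstats import ztest

rng = np.random.default_rng(0)
sample_a = rng.normal(loc=5.0, scale=1.0, size=50)
sample_b = rng.normal(loc=5.4, scale=1.2, size=60)

z_score, p = two_sample_z_test(sample_a, sample_b, alternative='two-sided')
print(z_score, p)

# Cross-check against statsmodels' two-sample z-test
print(ztest(sample_a, sample_b))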
def one_sample_z_test(sample_data, pop_mean, alternative='two-sided'):
    """This test can be found in statsmodels as ztest

    Determines the likelihood that our sample mean differs from our population mean, assuming that the data
    follows a normal distribution.

    Parameters
    ----------
    sample_data: list or numpy array, 1-D
        Our observational data
    pop_mean: float
        The mean of our population, or what we expect the mean of our sample data to be
    alternative: str, {two-sided, greater, less}, default is two-sided
        Our alternative hypothesis

    Return
    ------
    z_score: float
        The Z-score of our data
    p: float, 0 <= p <= 1
        The likelihood that our observed data differs from our population mean, assuming a normal distribution,
        due to chance
    """
    if not isinstance(pop_mean, Number):
        raise TypeError("Population mean is not of numeric type")
    if not isinstance(alternative, str):
        raise TypeError("Alternative Hypothesis is not of string type")
    if alternative.casefold() not in ['two-sided', 'greater', 'less']:
        raise ValueError("Cannot determine method for alternative hypothesis")
    if len(sample_data) < 30:
        raise AttributeError("Too few observations for z-test to be reliable, use t-test instead")
    sample_data = _check_table(sample_data, False)
    sample_mean = np.mean(sample_data)
    sample_std = np.std(sample_data, ddof=1)
    # Parentheses matter here: standardise the difference between the sample and population means
    z_score = (sample_mean - pop_mean) / _standard_error(sample_std, len(sample_data))
    if alternative.casefold() == 'two-sided':
        p = 2 * (1 - norm.cdf(abs(z_score)))
    elif alternative.casefold() == 'greater':
        p = 1 - norm.cdf(z_score)
    else:
        p = norm.cdf(z_score)
    return z_score, p
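# A minimal usage sketch (not part of the library): it assumes numpy and statsmodels are installed and that
# one_sample_z_test is importable from this module. The simulated heights and the population mean of 172 are
# made up for illustration; statsmodels.stats.weightstats.ztest is used only as a cross-check.
import numpy as np
from statsmodels.stats.weightstats import ztest

rng = np.random.default_rng(1)
heights = rng.normal(loc=170.0, scale=8.0, size=40)

z_score, p = one_sample_z_test(heights, 172, alternative='two-sided')
print(z_score, p)

# Cross-check against statsmodels' one-sample z-test with the same hypothesised mean
print(ztest(heights, value=172))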
def test_standardError_nZero_Error(self):
    s, n = 10, 0
    with pytest.raises(ValueError):
        utils._standard_error(s, n)
def test_standardError_nFloat_Error(self):
    s, n = 10, 1.5
    with pytest.raises(TypeError):
        utils._standard_error(s, n)
def test_standardError_Dict2_Error(self):
    s, n = 10, {'n': 10}
    with pytest.raises(TypeError):
        utils._standard_error(s, n)
def test_standardError_List2_Error(self):
    s, n = 10, ['n']
    with pytest.raises(TypeError):
        utils._standard_error(s, n)
def test_standardError_Dict1_Error(self):
    s, n = {'s': 10}, 10
    with pytest.raises(TypeError):
        utils._standard_error(s, n)
def test_standardError_List1_Error(self):
    s, n = ['s'], 10
    with pytest.raises(TypeError):
        utils._standard_error(s, n)
def test_standardError_Result(self):
    s, n = 10, 100
    assert pytest.approx(1.0, 0.01) == utils._standard_error(s, n)
def test_standardError_String1_Error(self):
    s, n = 's', 10
    with pytest.raises(TypeError):
        utils._standard_error(s, n)
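# utils._standard_error itself is not shown in this section. The sketch below is a hypothetical reconstruction
# that is consistent with the tests above: TypeError for non-numeric s or a non-integer n, ValueError for n <= 0,
# and a return value of s / sqrt(n) (so _standard_error(10, 100) is approximately 1.0). The real helper may differ.
from math import sqrt
from numbers import Number


def _standard_error(s, n):
    """Hypothetical sketch: return the standard error of the mean, s / sqrt(n), after validating inputs."""
    if not isinstance(s, Number):
        raise TypeError("Standard deviation is not of numeric type")
    if not isinstance(n, int):
        raise TypeError("Sample size is not of integer type")
    if n <= 0:
        raise ValueError("Sample size must be greater than zero")
    return s / sqrt(n)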