def pooled_variance(distribution1, distribution2, verbose=False):
    """
    Compute the pooled variance of two samples whose sizes may differ.

    Each sample's sum of squared deviations from its own mean is added
    together and divided by the sum of the samples' degrees of freedom
    (as reported by `get_dof` for each sample size).

    Parameters
    ----------
    > distribution1: an array containing the values of the first sample
    > distribution2: an array containing the values of the second sample
    > verbose (optional): a boolean; if `True`, the mean and sum of squares
      of each sample are printed before the result is returned; if `False`,
      nothing is printed

    Returns
    -------
    The pooled variance of the two samples
    """
    mean_first = get_mean(distribution1)
    ss_first = sum((value - mean_first) ** 2 for value in distribution1)

    mean_second = get_mean(distribution2)
    ss_second = sum((value - mean_second) ** 2 for value in distribution2)

    size_first, size_second = len(distribution1), len(distribution2)

    if verbose:
        print(f"Mean of sample 1: {mean_first}")
        print(f"Sum of squares for sample 1: {ss_first}")
        print(f"Mean of sample 2: {mean_second}")
        print(f"Sum of squares for sample 2: {ss_second}")

    combined_dof = get_dof(size_first) + get_dof(size_second)
    return (ss_first + ss_second) / combined_dof
def get_y_intercept(x_dist, y_dist, r):
    """
    Get the y-intercept `c` of the line y = mx + c.

    Rearranging at the point of means gives
        c = ybar - m * xbar
    where the slope `m` comes from `get_slope(r, sy, sx)`, i.e. r(sy/sx),
    and `sy`, `sx` are the 'Sample SD' entries that `bessel_correction`
    reports for `y_dist` and `x_dist`.

    Parameters
    ----------
    > x_dist: an array containing the x values
    > y_dist: an array containing the y values
    > r: the value forwarded to `get_slope` as its first argument
      (the `r` in r(sy/sx))

    Returns
    -------
    The y-intercept, ybar - m * xbar
    """
    y_mean, x_mean = get_mean(y_dist), get_mean(x_dist)

    # Sample standard deviations, looked up in the same order: y first, then x.
    y_sd, x_sd = (
        bessel_correction(dist)['Sample SD'] for dist in (y_dist, x_dist)
    )

    slope = get_slope(r, y_sd, x_sd)
    intercept = y_mean - slope * x_mean
    return intercept
def get_dependent_stats(x1, x2):
    """
    Get the following statistics for two dependent (paired) distributions.

    1. First sample and sample mean
    2. Second sample and sample mean
    3. Difference of both the samples and difference mean

    Parameters
    ----------
    > x1: an array containing elements of the first distribution
    > x2: an array containing elements of the second distribution; must have
      the same number of elements as `x1`, since elements are paired by
      position

    Returns
    -------
    A dictionary with the following key-value pairs.
    {
        'first_sample': x1,              # the array `x1`
        'first_sample_mean': mean1,      # mean of the first sample
        'second_sample': x2,             # the array `x2`
        'second_sample_mean': mean2,     # mean of the second sample
        'difference': D,                 # list of `x2[i] - x1[i]` for each position i
        'mean_difference': mean_diff     # mean of the above `difference` list
    }

    Raises
    ------
    `ValueError` if `x1` and `x2` do not have the same number of elements.
    """
    # Without this check a longer `x2` would be silently truncated when taking
    # differences (while its mean would still use every element), and a
    # shorter `x2` would surface as an unexplained IndexError.
    if len(x1) != len(x2):
        raise ValueError(
            "Dependent samples must have the same size, "
            f"got {len(x1)} and {len(x2)}"
        )

    # Differences are taken as second minus first.
    differences = [second - first for first, second in zip(x1, x2)]

    return {
        "first_sample": x1,
        "first_sample_mean": get_mean(x1),
        "second_sample": x2,
        "second_sample_mean": get_mean(x2),
        "difference": differences,
        "mean_difference": get_mean(differences),
    }
def sum_squared_between(samples):
    """
    Get the sum of squares for between-group variability of the samples.

    For every sample, the squared distance between that sample's mean and
    the grand mean is weighted by the sample's size; the weighted terms are
    then added up.

    Parameter
    ---------
    > `samples`: a tuple of lists, where each list is a sample containing all the values of that sample

    Returns
    -------
    The sum of squares for between-group variability.
    """
    grand_mean = get_grand_mean(samples)
    return sum(
        len(group) * ((get_mean(group) - grand_mean) ** 2)
        for group in samples
    )
def honestly_significant_samples(samples, q_critical, verbose=True):
    """
    Get / print the honestly significant samples among the tuple of samples.

    Every pair of sample means is compared; a pair is reported when the
    absolute difference of the two means exceeds Tukey's HSD, obtained from
    `tukey_HSD(q_critical, ms_within(samples), n)`.

    Assumption: All samples have the same size.

    Parameters
    ----------
    > `samples`: a tuple of lists, where each list is a sample containing all the values of that sample
    > `q_critical`: The Studentized Range Statistic at a certain alpha level
    > `verbose`: a `bool` that governs whether or not the indices of significantly different samples
      are printed (defaults to `True`)

    Returns
    -------
    A list of tuples, where each tuple `(m1, m2)` holds the means of a pair
    of samples that are honestly significantly different.

    Raises
    ------
    `ValueError` if the samples do not all have the same size.
    """
    n = len(samples[0])  # the common size every sample must share

    # Validate before doing any statistics. Raising a plain string is itself a
    # TypeError in Python 3, so a real exception type is used here.
    if any(len(sample) != n for sample in samples):
        raise ValueError("Samples do not have the same size")

    ms_with = ms_within(samples)
    THSD = tukey_HSD(q_critical, ms_with, n)  # Tukey's HSD

    means = [get_mean(sample) for sample in samples]

    significantly_different_means = []
    for i, m1 in enumerate(means):
        # Only look at later samples so each unordered pair is tested once.
        for j, m2 in enumerate(means[i + 1:], start=i + 1):
            if abs(m1 - m2) > THSD:
                significantly_different_means.append((m1, m2))
                if verbose:
                    print(f"Means of samples indexed {i} and {j} are honestly significantly different")

    return significantly_different_means
# t-tests ## t-statistic xbar = 6.47 s = 0.4 n = 500 mu0 = 6.07 t = get_t_stat(xbar, mu0, None, s, n) print(f"t-statistic for these parameters is {t}") print() males = [41, 56, 82, 39, 3, 55, 70, 32, 46, 28, 39, 38, 47, 44, 45, 43, 28, 43, 56, 56, 33, 68, 49, 17, 40, 2, 28, 35, 27, 39, 46, 33, 30, 72, 28, 52, 47, 50, 25, 39] famles = [93, 40, 36, 62, 52, 59, 59, 37, 58, 45, 33, 43, 32, 37, 51, 84, 30, 72, 63, 42, 60, 30, 29 ,52, 58, 50, 56, 42] SE = 4.01 t = get_t_stat(get_mean(males), get_mean(famles), SE) print(f"t for quiz = {t}") print() ## t-critical alpha = 0.05 dof = 12 n = 30 t_critical = get_t_critical(get_dof(n), alpha, tails=2) print(f"t-critical value for alpha level {alpha} and sample size {n} = {t_critical}") print() ## t-test if t_test(t, t_critical):