Ejemplo n.º 1
0
	def pooled_variance(distribution1, distribution2, verbose=False):
		"""
		Compute the pooled variance of two samples whose sizes may differ.

		The result is the combined sum of squared deviations of both samples
		(each taken around its own mean) divided by the sum of the two samples'
		degrees of freedom as reported by `get_dof`.

		Parameters
		----------
		> distribution1: a sequence holding the values of the first sample
		> distribution2: a sequence holding the values of the second sample
		> verbose (optional): when `True`, print the mean and the sum of squares of each sample
							before returning; print nothing when `False` (the default)

		Returns
		-------
		The pooled variance of the two samples
		"""

		# Sum of squared deviations of the first sample around its own mean
		mean_first = get_mean(distribution1)
		ss_first = sum((value - mean_first)**2 for value in distribution1)

		# Same quantity for the second sample
		mean_second = get_mean(distribution2)
		ss_second = sum((value - mean_second)**2 for value in distribution2)

		size_first, size_second = len(distribution1), len(distribution2)

		if verbose:
			print(f"Mean of sample 1: {mean_first}")
			print(f"Sum of squares for sample 1: {ss_first}")
			print(f"Mean of sample 2: {mean_second}")
			print(f"Sum of squares for sample 2: {ss_second}")

		combined_ss = ss_first + ss_second
		combined_dof = get_dof(size_first) + get_dof(size_second)
		return combined_ss / combined_dof
Ejemplo n.º 2
0
def get_y_intercept(x_dist, y_dist, r):
	"""
	Return the y-intercept `c` of the line y = mx + c.

	Rearranged at the point of means: c = ybar - m * xbar, where the slope
	m = r(sy/sx) comes from `get_slope(r, sy, sx)`.

	Parameters
	----------
	> x_dist: the values of the x distribution
	> y_dist: the values of the y distribution
	> r: the value passed through to `get_slope` as its first argument

	Returns
	-------
	ybar - m * xbar
	"""

	mean_y = get_mean(y_dist)
	mean_x = get_mean(x_dist)

	# 'Sample SD' entry of the dictionary returned by `bessel_correction`
	sd_y = bessel_correction(y_dist)['Sample SD']
	sd_x = bessel_correction(x_dist)['Sample SD']

	return mean_y - get_slope(r, sd_y, sd_x) * mean_x
Ejemplo n.º 3
0
def get_dependent_stats(x1, x2):
	"""
	Get the following statistics for two dependent (paired) distributions.
	1. First sample and sample mean
	2. Second sample and sample mean
	3. Element-wise difference of the samples (`x2 - x1`) and the mean of that difference

	Parameters
	----------
	> x1: an array containing elements of the first distribution
	> x2: an array containing elements of the second distribution; must have as many elements as `x1`

	Returns a dictionary with the following key-value pairs.
	{
		'first_sample': x1,  # the array `x1`
		'first_sample_mean': mean1,  # mean of the first sample
		'second_sample': x2,  # the array `x2`
		'second_sample_mean': mean2,  # mean of the second sample
		'difference': D,  # a list with `x2[i] - x1[i]` for each index `i`
		'mean_difference': mean_diff  # mean of the above `difference` list
	}

	Raises
	------
	ValueError: if `x1` and `x2` do not have the same number of elements
	"""

	# Paired samples must line up one-to-one. Without this check a longer `x2`
	# would have its trailing elements silently left out of the differences
	# while still contributing to `second_sample_mean`.
	if len(x1) != len(x2):
		raise ValueError("Dependent samples must have the same number of elements")

	D = [second - first for first, second in zip(x1, x2)]

	mean1 = get_mean(x1)
	mean2 = get_mean(x2)
	mean_diff = get_mean(D)

	return {
		"first_sample": x1,
		"first_sample_mean": mean1,
		"second_sample": x2,
		"second_sample_mean": mean2,
		"difference": D,
		"mean_difference": mean_diff,
	}
Ejemplo n.º 4
0
def sum_squared_between(samples):
	"""
	Compute the between-group sum of squares for a collection of samples.

	Each sample contributes its size multiplied by the squared distance
	between its own mean and the grand mean of all samples.

	Parameter
	---------
	> `samples`: a tuple of lists, where each list is a sample containing all the values of that sample

	Returns
	-------
	The sum of squares for between-group variability.
	"""

	grand_mean = get_grand_mean(samples)
	return sum(
		len(group) * (get_mean(group) - grand_mean) ** 2
		for group in samples
	)
Ejemplo n.º 5
0
def honestly_significant_samples(samples, q_critical, verbose=True):
	"""
	Get / print the honestly significant samples among the tuple of samples.
	Assumption: All samples have the same size.

	Parameters
	----------
	> `samples`: a tuple of lists, where each list is a sample containing all the values of that sample
	> `q_critical`: The Studentized Range Statistic at a certain alpha level
	> `verbose`: a `bool` that governs whether or not the indices of significantly different samples be printed (defaulted to `True`)

	Returns
	-------
	A list of tuples where each tuple contains a pair of honestly significant means.

	Raises
	------
	ValueError: if the samples do not all have the same size
	"""

	n = len(samples[0])
	k = len(samples)

	# All samples must have the same size. Check this before doing any other
	# work so invalid input fails fast with a real exception (raising a plain
	# string is itself a TypeError in Python 3).
	for i in range(1, k):
		if len(samples[i]) != n:
			raise ValueError("Samples do not have the same size")

	ms_with = ms_within(samples)
	THSD = tukey_HSD(q_critical, ms_with, n)  # Tukey's HSD

	means = [get_mean(sample) for sample in samples]
	significantly_different_means = []

	# Compare every unordered pair of sample means exactly once
	for i in range(k - 1):
		m1 = means[i]
		for j in range(i + 1, k):
			m2 = means[j]
			# Only the magnitude of the difference matters
			if abs(m1 - m2) > THSD:
				significantly_different_means.append((m1, m2))
				if verbose:
					print(f"Means of samples indexed {i} and {j} are honestly significantly different")

	return significantly_different_means
Ejemplo n.º 6
0
# t-tests
# Demo script: computes two t-statistics and a t-critical value using helpers
# defined elsewhere (`get_t_stat`, `get_mean`, `get_t_critical`, `get_dof`)
# and prints each result.

## t-statistic
# First call: five arguments, with `None` in the third position.
# NOTE(review): presumably xbar/mu0 are the sample and hypothesized means and
# s/n the sample SD and size used to derive the standard error -- confirm
# against `get_t_stat`.
xbar = 6.47
s = 0.4
n = 500
mu0 = 6.07
t = get_t_stat(xbar, mu0, None, s, n)
print(f"t-statistic for these parameters is {t}")

print()

# Second call: the means of two lists plus `SE` as the third argument.
# NOTE(review): `famles` looks like a typo for `females`; left unchanged here
# because the name may be referenced further down the file.
males = [41, 56, 82, 39, 3, 55, 70, 32, 46, 28, 39, 38, 47, 44, 45, 43, 28, 43, 56, 56, 33, 68, 49, 17, 40, 2, 28, 35, 27, 39, 46, 33, 30, 72, 28, 52, 47, 50, 25, 39]
famles = [93, 40, 36, 62, 52, 59, 59, 37, 58, 45, 33, 43, 32, 37, 51, 84, 30, 72, 63, 42, 60, 30, 29 ,52, 58, 50, 56, 42]
SE = 4.01
t = get_t_stat(get_mean(males), get_mean(famles), SE)  # overwrites the earlier `t`
print(f"t for quiz = {t}")

print()

## t-critical
# NOTE(review): `dof` is assigned but not used in the lines below; the degrees
# of freedom passed to `get_t_critical` come from `get_dof(n)` instead.
alpha = 0.05
dof = 12
n = 30  # rebinds `n` (previously 500)
t_critical = get_t_critical(get_dof(n), alpha, tails=2)
print(f"t-critical value for alpha level {alpha} and sample size {n} = {t_critical}")

print()

## t-test
if t_test(t, t_critical):