def correlation(x, y): """ Covariance values are sometimes difficult to interprete. For this reason, correlation is a more common measure. Correlation is always unitless and always lies between -1 (perfect anti-correlation) and 1 (perfect correlation). Correlation is sensitive to outliers. """ stdev_x = standard_deviation(x) stdev_y = standard_deviation(y) if stdev_x > 0 and stdev_y > 0: return covariance(x, y) / stdev_x / stdev_y else: return 0 # if no variation, correlation is zero
def scale(data_matrix):
    """returns the means and standard deviations of each column"""
    # data matrix e.g. [[63, 67, 70], [160, 170.2, 177.8], [150, 160, 171]]
    num_rows, num_cols = shape(data_matrix)
    means = [mean(get_column(data_matrix, j))
             for j in range(num_cols)]
    stdevs = [standard_deviation(get_column(data_matrix, j))
              for j in range(num_cols)]
    return means, stdevs
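# One natural use of scale(), sketched under the assumption that the shape()
# and get_column() helpers used above are available: rescale each column to
# have mean 0 and standard deviation 1, leaving zero-variance columns alone.
# (rescale is an illustrative name, not a function defined in this module.)
def rescale(data_matrix):
    means, stdevs = scale(data_matrix)

    def rescaled(i, j):
        if stdevs[j] > 0:
            return (data_matrix[i][j] - means[j]) / stdevs[j]
        else:
            return data_matrix[i][j]    # leave constant columns untouched

    num_rows, num_cols = shape(data_matrix)
    return [[rescaled(i, j) for j in range(num_cols)]
            for i in range(num_rows)]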
# Repeatedly take a bootstrap sample.
# If the coefficient of one of the independent variables doesn't vary much
# across samples, then we can be confident that our estimate is relatively tight.
# If the coefficient varies greatly across samples, then we can't be at all
# confident in our estimate.
import random

random.seed(0)
bootstrap_betas = bootstrap_statistic(
    list(zip(trainer_party_stats, trainer_badge_counts)),
    estimate_sample_beta,
    10)

print('bootstrap betas:')
for beta in bootstrap_betas:
    print('beta = %s' % beta)

bootstrap_standard_errors = [
    standard_deviation([beta[index] for beta in bootstrap_betas])
    for index in range(3)
]
print('standard errors: %s' % bootstrap_standard_errors)

# We can then evaluate the meaningfulness of the betas
# with the following calculations
def p_value(beta_hat_j, sigma_hat_j):
    if beta_hat_j > 0:
        # twice the probability of seeing an even larger value
        return 2 * (1 - normal_cdf(beta_hat_j / sigma_hat_j))
    else:
        # twice the probability of seeing an even smaller value
        return 2 * normal_cdf(beta_hat_j / sigma_hat_j)
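# A hedged sketch of how these pieces fit together, assuming
# estimate_sample_beta accepts the full dataset in the same list-of-pairs
# form passed to bootstrap_statistic above (full_sample_beta is an
# illustrative name, not defined elsewhere here): pair each full-sample
# coefficient with its bootstrap standard error to get a two-sided p-value.
full_sample_beta = estimate_sample_beta(
    list(zip(trainer_party_stats, trainer_badge_counts)))
for j, (beta_hat_j, sigma_hat_j) in enumerate(
        zip(full_sample_beta, bootstrap_standard_errors)):
    print('coefficient %d: p-value = %s' % (j, p_value(beta_hat_j, sigma_hat_j)))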
def least_squares_fit(xs, ys):
    """given training values for xs and ys, find the
    alpha and beta that minimize the sum of squared errors"""
    beta = correlation(xs, ys) * standard_deviation(ys) / standard_deviation(xs)
    alpha = mean(ys) - beta * mean(xs)
    return alpha, beta
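# Minimal usage sketch: on data that exactly satisfies y = 3x + 1, the fit
# should recover alpha = 1 and beta = 3 (up to floating-point error).
xs = [1, 2, 3, 4, 5]
ys = [3 * x_i + 1 for x_i in xs]
alpha, beta = least_squares_fit(xs, ys)
assert abs(alpha - 1) < 1e-9
assert abs(beta - 3) < 1e-9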