def sort_by_standard_deviation():
    global data_matrix
    global ALL_count
    global snr_tuples
    global progressbar_total
    sample_length = len(data_matrix[0])
    sample_count = len(data_matrix)
    for attribute_index in range(1, sample_length):
        attribute_list = list()
        for sample_index in range(sample_count):
            attribute_list.append(data_matrix[sample_index][attribute_index])
        # Sum of per-class standard deviations. The second slice previously
        # started at ALL_count + 1, which silently skipped the sample at
        # index ALL_count.
        sd_value = statistics.standard_deviation(
            attribute_list[:ALL_count]) + statistics.standard_deviation(
                attribute_list[ALL_count:])
        rounded_sd = math.ceil(sd_value * 10000) / 10000
        snr_tuples.append((attribute_index, rounded_sd))
    sort.randomized_quick_sort_for_tuples(snr_tuples, 0, len(snr_tuples) - 1)
    progressbar.show(7, progressbar_total, prefix='Progress:',
                     suffix='Complete', length=50)
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]:
    """Given two vectors x and y, find the least-squares
    values of alpha and beta"""
    beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    alpha = mean(y) - beta * mean(x)
    return alpha, beta
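# A minimal, self-contained sanity check of the closed-form slope above (not
# part of the original module): it recomputes correlation with the stdlib and
# confirms that beta = correlation(x, y) * stdev(y) / stdev(x) equals the
# textbook OLS slope cov(x, y) / var(x) on toy data.
import statistics as _st


def _ols_slope_demo():
    xs = [1.0, 2.0, 3.0, 4.0, 5.0]
    ys = [2.1, 3.9, 6.2, 8.1, 9.8]
    cov = sum((xi - _st.mean(xs)) * (yi - _st.mean(ys))
              for xi, yi in zip(xs, ys)) / (len(xs) - 1)
    corr = cov / (_st.stdev(xs) * _st.stdev(ys))
    beta_via_corr = corr * _st.stdev(ys) / _st.stdev(xs)
    beta_via_cov = cov / _st.variance(xs)
    assert abs(beta_via_corr - beta_via_cov) < 1e-9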
def sort_by_standard_deviation():
    global transposed_raw_data_matrix
    global ALL_count
    global MLL_count
    global AML_count
    global snr_tuples
    global progressbar_total
    sample_length = len(transposed_raw_data_matrix[0])
    sample_count = len(transposed_raw_data_matrix)
    MLL_end_index = ALL_count + MLL_count
    for attribute_index in range(sample_length - 1):
        attribute_list = list()
        for sample_index in range(sample_count):
            attribute_list.append(
                transposed_raw_data_matrix[sample_index][attribute_index])
        # Sum of the per-class (ALL / MLL / AML) standard deviations.
        sd_value = statistics.standard_deviation(
            attribute_list[:ALL_count]) + statistics.standard_deviation(
                attribute_list[ALL_count:MLL_end_index]
            ) + statistics.standard_deviation(attribute_list[MLL_end_index:])
        rounded_sd = math.ceil(sd_value * 10000) / 10000
        snr_tuples.append((attribute_index, rounded_sd))
    sort.randomized_quick_sort_for_tuples(snr_tuples, 0, len(snr_tuples) - 1)
    progressbar.show(9, progressbar_total, prefix="Progress:",
                     suffix="Complete", length=50)
def least_squares_fit(x, y): """given training values for x and y, find the least-squares values of alpha and beta""" #x的系数β=xy相关系数*y的标准差/x的标准差 beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) alpha = mean(y) - beta * mean(x) return alpha, beta
def print_component_statistics_old(n_components_to_show=N_COMPONENTS_TO_SHOW):
    print("component statistics:\n")
    for i in range(n_components_to_show):
        print("component " + str(i) + ":")
        like_comp = likesByComponent[i]
        dislike_comp = dislikesByComponent[i]
        print("means: like = " + str(mean(like_comp)) +
              " dislike = " + str(mean(dislike_comp)))
        print("medians: like = " + str(median(like_comp)) +
              " dislike = " + str(median(dislike_comp)))
        print("stdevs: like = " + str(standard_deviation(like_comp)) +
              " dislike = " + str(standard_deviation(dislike_comp)))
        print("interquartile range: like = " +
              str(interquartile_range(like_comp)) +
              " dislike = " + str(interquartile_range(dislike_comp)))
        print("\n")
def click_button_add_and_close(self, window, choice_of_calculation):
    if self.check_empty_combobox_graph():
        return
    analyzed_model = self.get_model()
    analysis_model = self.get_analysis(analyzed_model)
    if choice_of_calculation == 1:
        statistics.check_stationarity_click_button(analysis_model)
    elif choice_of_calculation == 2:
        statistics.average_value_click_button(analysis_model)
    elif choice_of_calculation == 3:
        statistics.dispersion_click_button(analysis_model)
    elif choice_of_calculation == 4:
        statistics.dispersion_x_10_click_button(analysis_model)
    elif choice_of_calculation == 5:
        statistics.standard_deviation(analysis_model)
    elif choice_of_calculation == 6:
        statistics.asymmetry_click_button(analysis_model)
    elif choice_of_calculation == 7:
        statistics.asymmetry_coefficient_click_button(analysis_model)
    elif choice_of_calculation == 8:
        statistics.excess_click_button(analysis_model)
    elif choice_of_calculation == 9:
        statistics.kurtosis_click_button(analysis_model)
    elif choice_of_calculation == 10:
        statistics.standard_ratio_click_button(analysis_model)
    elif choice_of_calculation == 11:
        statistics.mean_absolute_deviation_click_button(analysis_model)
    elif choice_of_calculation == 12:
        statistics.x_min_click_button(analysis_model)
    elif choice_of_calculation == 13:
        statistics.x_max_click_button(analysis_model)
    window.destroy()
def scale(data_matrix): """returns the mean and standard deviation of each column""" num_rows, num_cols = shape(data_matrix) means = [mean(get_column(data_matrix,j)) for j in range(num_cols)] stdevs = [standard_deviation(get_column(data_matrix, j)) for j in range(num_cols)] return means, stdevs
def scale(data_matrix):
    num_rows, num_cols = shape(data_matrix)
    means = [mean(get_column(data_matrix, j)) for j in range(num_cols)]
    stdevs = [standard_deviation(get_column(data_matrix, j))
              for j in range(num_cols)]
    return means, stdevs
def plot_multiple_histories(histories, directory):
    colors = ['r', 'b', 'g', 'c', 'm', 'y', 'k']
    colors_b = ['r--', 'b--', 'g--', 'c--', 'm--', 'y--', 'k--']
    plt.clf()
    fig, ax = plt.subplots()
    ax.set_xlabel("$Days$")
    ax.set_ylabel("$Average-Population$")
    # datasets[population][day] holds one value per history run
    datasets = [[[0] * len(histories)
                 for _ in range(len(next(iter(histories[0].values()))))]
                for _ in range(len(histories[0]))]
    labels = []
    history = 0
    for n in histories:
        i = 0
        for h in n:
            if history == 0:
                labels.append(h)
            j = 0
            for val in n[h]:
                datasets[i][j][history] = val
                j += 1
            i += 1
        history += 1
    """
    fig1, ax1 = plt.subplots()
    ax1.set_title('Box Plot')
    ax1.boxplot(datasets[0], positions=[1, 3, 5, 7, 9], widths=0.6)
    ax1.boxplot(datasets[1], positions=[2, 4, 6, 8, 10], widths=0.6)
    ax1.set_xticklabels([0, 1, 2, 3, 4, 5])
    ax1.set_xticks([0, 1.5, 3.5, 5.5, 7.5, 9.5])
    plt.savefig("plots/box_plot.png")"""
    p_color = 0
    for populations in datasets:
        std = []
        mean = []
        for days in populations:
            mean.append(statistics.mean(days))
            std.append(statistics.standard_deviation(days))
        plt.plot(range(len(mean)), mean, colors[p_color % 7],
                 label=labels[p_color % 7])
        # raw string avoids the invalid '\s' escape in '$\sigma$'
        plt.plot(range(len(mean)), [sum(x) for x in zip(mean, std)],
                 colors_b[p_color % 7],
                 label=r"{} +/- $\sigma$".format(labels[p_color % 7]))
        plt.plot(range(len(mean)), [x - y for (x, y) in zip(mean, std)],
                 colors_b[p_color % 7])
        p_color += 1
    ax.legend()
    plt.savefig("plots/{}/histories_deviation.png".format(directory))
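# A compact alternative (illustrative only, not from the original module) to
# the three plt.plot calls above: matplotlib's fill_between draws the
# mean +/- sigma band directly. Data here is made up for the demo.
import statistics
import matplotlib.pyplot as plt

_days = range(5)
_runs = [[10, 12, 15, 16, 20], [11, 13, 14, 18, 19], [9, 12, 16, 17, 21]]
_means = [statistics.mean(col) for col in zip(*_runs)]
_stds = [statistics.stdev(col) for col in zip(*_runs)]
plt.plot(_days, _means, 'b', label='mean')
plt.fill_between(_days,
                 [m - s for m, s in zip(_means, _stds)],
                 [m + s for m, s in zip(_means, _stds)],
                 alpha=0.3, label=r'+/- $\sigma$')
plt.legend()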
def fit(self, train_data_features, train_data_labels):
    assert np.ndarray == type(train_data_features)
    assert np.ndarray == type(train_data_labels)
    assert train_data_features.shape[0] == len(train_data_labels)
    # labels are used to calculate the prior P(C) and likelihood P(X|C)
    # probabilities; first find the indices of each label
    # NOTE: we assume that the label vector contains categorical values
    instances_by_label = {}
    for label in set(train_data_labels):
        instances_by_label[label] = np.where(train_data_labels == label)[0]
    # prior probabilities (key: label, value: prior)
    prior_probabilities = {}
    for label in set(train_data_labels):
        prior_probabilities[label] = len(
            instances_by_label[label]) / float(len(train_data_labels))
    # features are used to calculate the likelihood P(X|C) probabilities
    categorical_columns = []
    continuous_columns = []
    for i in range(len(train_data_features[0])):
        try:
            _ = [float(feature_value)
                 for feature_value in set(train_data_features[:, i])]
            # if the cast succeeds, the column holds continuous values,
            # so we model it with a distribution
            continuous_columns.append(i)
        except ValueError:
            # otherwise the column holds categorical values
            categorical_columns.append(i)
    # continuous features (key: label, value: [mean, standard deviation])
    # (note: mean/std are taken over the whole continuous block of each
    # label, not per column, as in the original)
    cont_mat = train_data_features[:, continuous_columns].astype(float)
    likelihood_probability_arguments = {}
    for label in set(train_data_labels):
        likelihood_probability_arguments[label] = [
            st.mean(cont_mat[instances_by_label[label], :]),
            st.standard_deviation(cont_mat[instances_by_label[label], :])
        ]
    # categorical features (key: "value|label", value: P(value|label))
    cate_mat = train_data_features[:, categorical_columns].astype(str)
    likelihood_probabilities = {}
    for label in set(train_data_labels):
        # slice the categorical matrix, not the full feature matrix, so
        # column i really is the i-th categorical column
        sub_cate_mat = cate_mat[instances_by_label[label], :]
        for i in range(cate_mat.shape[1]):
            feature_column = sub_cate_mat[:, i]
            for feature_value in set(feature_column):
                likelihood_tag = feature_value + "|" + label
                # relative frequency of the value within this label; the
                # original divided by the array itself instead of its length
                likelihood_probabilities[likelihood_tag] = len(
                    np.where(feature_column == feature_value)[0]) / float(
                        len(feature_column))
    # package the results of training
    self._continues_columns = continuous_columns
    self._categorical_columns = categorical_columns
    self._prior_probabilities = prior_probabilities
    self._likelihood_probabilities = likelihood_probabilities
    self._likelihood_probability_arguments = likelihood_probability_arguments
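# A tiny self-contained illustration (not from the original class) of the
# label-indexing idiom used above: np.where(labels == label)[0] yields the row
# indices for one class, and the prior is just that class's relative frequency.
import numpy as np

labels_demo = np.array(["spam", "ham", "spam", "spam", "ham"])
for lbl in set(labels_demo):
    idx = np.where(labels_demo == lbl)[0]
    print(lbl, "indices:", idx, "prior:", len(idx) / len(labels_demo))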
def scale(matrix):
    """returns the means and standard deviations of each column"""
    num_rows, num_columns = shape(matrix)
    means = [mean(get_column(matrix, j)) for j in range(num_columns)]
    stdevs = [standard_deviation(get_column(matrix, j))
              for j in range(num_columns)]
    return means, stdevs
def printComponentStatistics():
    print("component statistics:\n")
    for i in range(N_COMPONENTS_TO_SHOW):
        print("component " + str(i) + ":")
        likeComp = likesByComponent[i]
        dislikeComp = dislikesByComponent[i]
        print("means: like = " + str(mean(likeComp)) +
              " dislike = " + str(mean(dislikeComp)))
        print("medians: like = " + str(median(likeComp)) +
              " dislike = " + str(median(dislikeComp)))
        print("stdevs: like = " + str(standard_deviation(likeComp)) +
              " dislike = " + str(standard_deviation(dislikeComp)))
        print("interquartile range: like = " +
              str(interquartile_range(likeComp)) +
              " dislike = " + str(interquartile_range(dislikeComp)))
        print("\n")
def make_random_dress(pca, save_name, liked):
    random_array = []
    base = likesByComponent if liked else dislikesByComponent
    for c in base[:100]:
        mu = mean(c)
        sigma = standard_deviation(c)
        p = random.uniform(0.0, 1.0)
        num = inverse_normal_cdf(p, mu, sigma)
        random_array.append(num)
    construct(pca, random_array, 'results/createdDresses/' + save_name)
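# The loop above is inverse-transform sampling: feeding a uniform p through
# the inverse normal CDF draws from Normal(mu, sigma). A stdlib-only
# equivalent (illustrative, not the original inverse_normal_cdf) is
# random.gauss:
import random

_mu, _sigma = 0.0, 1.0
_p = random.uniform(0.0, 1.0)
# num = inverse_normal_cdf(_p, _mu, _sigma)   # as in the functions above
_num = random.gauss(_mu, _sigma)              # equivalent in distribution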
def scale(data: List[Vector]) -> Tuple[Vector, Vector]:
    """returns the means and standard deviations for each position"""
    dim = len(data[0])
    means = vector_mean(data)
    stdevs = [standard_deviation([vector[i] for vector in data])
              for i in range(dim)]
    return means, stdevs
def makeRandomDress(saveName, liked):
    randomArr = []
    base = likesByComponent if liked else dislikesByComponent
    for c in base[:100]:
        mu = mean(c)
        sigma = standard_deviation(c)
        p = random.uniform(0.0, 1.0)
        num = inverse_normal_cdf(p, mu, sigma)
        randomArr.append(num)
    construct(randomArr, "results/createdDresses/" + saveName)
def scale(data: List[Vector]) -> Tuple[Vector, Vector]:
    "Return the column-wise mean and standard deviation of a dataset"
    dim = len(data[0])
    means = vector_mean(data)
    std_devs = [standard_deviation([vector[i] for vector in data])
                for i in range(dim)]
    return means, std_devs
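# A minimal sketch (not in the original sources) of the usual companion to
# scale(): rescaling each column to mean 0 and standard deviation 1, skipping
# columns whose deviation is 0. Assumes plain lists of floats as vectors and
# uses stdlib statistics instead of the custom helpers above.
import statistics
from typing import List


def rescale_demo(data: List[List[float]]) -> List[List[float]]:
    dim = len(data[0])
    means = [statistics.mean([v[i] for v in data]) for i in range(dim)]
    stdevs = [statistics.stdev([v[i] for v in data]) for i in range(dim)]
    rescaled = [v[:] for v in data]  # copy so the input is untouched
    for v in rescaled:
        for i in range(dim):
            if stdevs[i] > 0:
                v[i] = (v[i] - means[i]) / stdevs[i]
    return rescaled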
def print_component_statistics(all_by_components,
                               n_components_to_show=N_COMPONENTS_TO_SHOW):
    print("component statistics:\n")
    for i in range(n_components_to_show):
        print("component " + str(i) + ":")
        comp = all_by_components[i]
        print("means: like = " + str(mean(comp)))
        print("medians: like = " + str(median(comp)))
        print("stdevs: like = " + str(standard_deviation(comp)))
        print("interquartile range: like = " + str(interquartile_range(comp)))
        print("\n")
def estimation(num, attempts):
    estimates = []
    for _ in range(attempts):
        pi_estimation = setup_points(num)
        estimates.append(pi_estimation)
    estimates_mean = mean(estimates)
    sigma = standard_deviation(estimates)
    print(f'mean={round(estimates_mean, 5)}, '
          f'sigma={round(sigma, 5)}, points={num}')
    return estimates_mean, sigma
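# setup_points() is not shown in this excerpt; a plausible self-contained
# stand-in (an assumption, not the original) estimates pi by sampling points
# uniformly in the unit square and counting how many land inside the quarter
# circle.
import random


def setup_points_demo(num_points):
    inside = 0
    for _ in range(num_points):
        x, y = random.random(), random.random()
        if x * x + y * y <= 1.0:
            inside += 1
    # (area of quarter circle) / (area of square) = pi / 4
    return 4 * inside / num_points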
def statisticalPlot(pingcalls):
    # create data for Gnuplot
    mini, avg, maxi = statistics.minavgmax(pingcalls)
    dat = [mini, avg, maxi, statistics.standard_deviation(pingcalls, avg)]
    # set graph properties
    g3 = Gnuplot.Gnuplot(persist=1)
    g3.reset()                           # reset graph
    g3.title('statistical information')  # set title of graph
    g3.xlabel('Stats')                   # set title of x-axis
    g3.ylabel('Time in ms')              # set title of y-axis
    g3('set autoscale')                  # autoscale
    g3("set xtics ('minimum' 0, 'average' 1, 'maximum' 2, "
       "'standard_deviation' 3)")
    g3.plot(Gnuplot.Data(dat, with_='boxes'))
def printComponentStatistics():
    print("component statistics:\n")
    for i in range(min(N_COMPONENTS_TO_SHOW, numComponents,
                       len(likesByComponent), len(dislikesByComponent))):
        print("component " + str(i) + ":")
        likeComp = likesByComponent[i]
        dislikeComp = dislikesByComponent[i]
        print("means: like = " + str(mean(likeComp)) +
              " dislike = " + str(mean(dislikeComp)))
        print("medians: like = " + str(median(likeComp)) +
              " dislike = " + str(median(dislikeComp)))
        print("stdevs: like = " + str(standard_deviation(likeComp)) +
              " dislike = " + str(standard_deviation(dislikeComp)))
        print("interquartile range: like = " +
              str(interquartile_range(likeComp)) +
              " dislike = " + str(interquartile_range(dislikeComp)))
        print("\n")
def estimation(number_of_needles, number_of_tries):
    estimations = []
    for _ in range(number_of_tries):
        pi_estimation = throw_needles(number_of_needles)
        estimations.append(pi_estimation)
    median_estimation = median(estimations)
    sigma = standard_deviation(estimations)
    print(f'Est={round(median_estimation, 5)}, '
          f'sigma={round(sigma, 5)}, needles={number_of_needles}')
    return (median_estimation, sigma)
def binned_means_graph_errors(data, name, min_count=10, min_width=10,
                              title='', xlabel='', ylabel=''):
    # Expect data to be of the form:
    # [(x,y), ...]
    data = sorted(data, key=operator.itemgetter(0))
    binner = Binner(data, min_count, min_width)
    binner.bin()
    pieces = binner.get_bins()
    points = []
    for piece in pieces:
        xdata, ydata = zip(*piece)
        # x error is bin width / 2
        xmax = max(xdata)
        xmin = min(xdata)
        xwidth = xmax - xmin
        xcentre = xmin + (xwidth / 2.0)
        xerror = xwidth / 2
        ymean = mean(ydata)
        ysigma = standard_deviation(ydata, ymean)
        # y error is the standard error of the mean
        yerror = ysigma / math.sqrt(len(ydata))
        points.append((xcentre, ymean, xerror, yerror))
    return graph_errors(points, name, title=title, xlabel=xlabel,
                        ylabel=ylabel)
def plotByCategory2(self):
    figure = plot.figure()
    canvas = FigureCanvas(figure)
    xl = pd.ExcelFile("../corpus-final09.xls")
    df = xl.parse("File list")
    colName = 'LCS Ratio (By Sentence, Advanced Pre)'
    categories = ['non', 'heavy', 'light', 'cut']
    colors = ['y.', 'g.', 'b.', 'r.']
    ax = figure.add_subplot(111)
    ax.axis([-1, 4, 0, 1.2])
    values = []
    for x in range(4):
        values.append(df.loc[df['Category'] == categories[x]][colName])
        ax.plot(len(values[x]) * [x], values[x], colors[x])
        m = stats.mean(values[x])
        std = stats.standard_deviation(values[x])
        m_str = str('%.3f' % round(m, 3))
        std_str = str('%.3f' % round(std, 3))
        ax.text((x + .1), m, 'mean = ' + m_str + '\nstd = ' + std_str)
    handles, labelsX = ax.get_legend_handles_labels()
    ax.legend(handles, categories, loc='lower right')
    ax.yaxis.grid()
    ax.xaxis.set_ticklabels(['', 'non', 'heavy', 'light', 'cut', ''])
    canvas.draw()
    return canvas
def test_standard_deviation(self):
    self.assertEqual(math.sqrt(10),
                     statistics.standard_deviation([1, 2, 3, 4, 4, 10]))
    self.assertEqual(0, statistics.standard_deviation([1]))
    self.assertEqual(0, statistics.standard_deviation([]))
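# A minimal sketch (an assumption about the module under test) of a
# standard_deviation that satisfies the expectations above: sample standard
# deviation (n - 1 denominator) that falls back to 0 for fewer than two
# values.
import math


def standard_deviation_demo(values):
    n = len(values)
    if n < 2:
        return 0  # matches the [] and [1] cases in the test
    m = sum(values) / n
    return math.sqrt(sum((v - m) ** 2 for v in values) / (n - 1))


assert standard_deviation_demo([1, 2, 3, 4, 4, 10]) == math.sqrt(10)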
                [200 + random.random() for _ in range(50)])

print("bootstrap_statistic(close_to_100, median, 100):")
print(bootstrap_statistic(close_to_100, median, 100))
print("bootstrap_statistic(far_from_100, median, 100):")
print(bootstrap_statistic(far_from_100, median, 100))
print()

random.seed(0)  # so that you get the same results as me
# list() so the pairs can be resampled repeatedly under Python 3
bootstrap_betas = bootstrap_statistic(list(zip(x, daily_minutes_good)),
                                      estimate_sample_beta, 100)
bootstrap_standard_errors = [
    standard_deviation([beta[i] for beta in bootstrap_betas])
    for i in range(4)
]
print("bootstrap standard errors", bootstrap_standard_errors)
print()
print("p_value(30.63, 1.174)", p_value(30.63, 1.174))
print("p_value(0.972, 0.079)", p_value(0.972, 0.079))
print("p_value(-1.868, 0.131)", p_value(-1.868, 0.131))
print("p_value(0.911, 0.990)", p_value(0.911, 0.990))
print()
print("regularization")

random.seed(0)
for alpha in [0.0, 0.01, 0.1, 1, 10]:
"""Evaluates stats_fn on num_samples bootstrap samples from data""" return [stats_fn(bootstrap_sample(data)) for _ in range(num_samples)] import random # 101 points all very close to 100 close_to_100 = [99.5 + random.random() for _ in range(101)] # 101 points, 50 of them near 0 and 50 of them near 100 far_from_100 = ([99.5 + random.random()] + [random.random() for _ in range(50)] + [200 + random.random() for _ in range(50)]) from statistics import median, standard_deviation; medians_close = bootstrap_statistic(close_to_100, median, 100) medians_far = bootstrap_statistic(far_from_100, median, 100) std_close = standard_deviation(medians_close) std_far = standard_deviation(medians_far) print(f"std_medians_close = {std_close}, std_medians_far = {std_far}") from typing import Tuple import datetime def estimate_sample_beta(pairs: List[Tuple[Vector, float]]) -> Vector: x_sample = [x for x,_ in pairs] y_sample = [y for _, y in pairs] beta = least_squares_fit(x_sample, y_sample, learning_rate, 5000, 25) print("Bootstrap sample", beta) return beta random.random()
def least_squares_fit(x, y): """given training values for x and y find the least-squares values of alpha and beta""" beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) alpha = mean(y) - beta * mean(x) return alpha, beta
import numpy
import random
import statistics
import matplotlib.pyplot as plt

'''
Calculate the mean, mode, median, standard deviation and variance.
'''
data = numpy.random.normal(1, 0.5, 10000)

mean = statistics.mean(data)
mode = statistics.mode(data)
median = statistics.median(data)
# the stdlib statistics module calls this stdev, not standard_deviation
std = statistics.stdev(data)
variance = statistics.variance(data)

print('')
print('Mean: ' + str(mean))
print('Mode: ' + str(mode))
print('Median: ' + str(median))
print('Standard Deviation: ' + str(std))
print('Variance: ' + str(variance))

plt.xlabel('value')
plt.ylabel('count')
plt.hist(data, 50)
plt.show()

'''
Calculate the covariance of uncorrelated data.
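# The excerpt cuts off mid-comment above; a plausible continuation (an
# assumption, not the original code) for the covariance of uncorrelated data:
# two independent samples should give off-diagonal covariance entries near
# zero.
import numpy as _np

_a = _np.random.normal(0, 1, 10000)
_b = _np.random.normal(0, 1, 10000)
print(_np.cov(_a, _b))  # off-diagonal terms ~ 0 for uncorrelated data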
def calculate_zscore(value, value_list):
    return ((value - statistics.mean(value_list)) /
            statistics.standard_deviation(value_list))
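# A quick self-contained check of the z-score formula above, using the stdlib
# names (mean/stdev) rather than the custom statistics module it assumes.
import statistics as _stats

_xs = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
_z = (9.0 - _stats.mean(_xs)) / _stats.stdev(_xs)
print(_z)  # how many (sample) standard deviations 9.0 sits above the mean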
        12.16, 30.7, 31.22, 34.65, 13.13, 27.51, 33.2, 31.57, 14.1, 33.42,
        17.44, 10.12, 24.42, 9.82, 23.39, 30.93, 15.03, 21.67, 31.09, 33.29,
        22.61, 26.89, 23.48, 8.38, 27.81, 32.35, 23.84]

random.seed(0)
beta = estimate_beta(x, daily_minutes_good)  # [30.63, 0.972, -1.868, 0.911]
print("beta", rounded(beta))
print("r-squared", rounded(multiple_r_squared(x, daily_minutes_good, beta)))

close_to_100 = [99.5 + random.random() for _ in range(101)]
far_from_100 = ([99.5 + random.random()] +
                [random.random() for _ in range(50)] +
                [200 + random.random() for _ in range(50)])
print()
print("medians for bootstrapped tight distribution",
      [round(val, 2)
       for val in sorted(bootstrap_statistic(close_to_100, median, 100))])
print("medians for bootstrapped extreme distribution",
      [round(val, 2)
       for val in sorted(bootstrap_statistic(far_from_100, median, 100))])

random.seed(0)
bootstrap_betas = bootstrap_statistic(list(zip(x, daily_minutes_good)),
                                      estimate_sample_beta, 100)
bootstrap_standard_errors = [
    standard_deviation([beta[i] for beta in bootstrap_betas])
    for i in range(4)
]
print()
print(bootstrap_standard_errors)
print()

random.seed(0)
for alpha in (0.0, 0.01, 0.1, 1, 10):
    beta_0 = estimate_beta_ridge(x, daily_minutes_good, alpha=alpha)
    print(alpha, ' : ', rounded(beta_0),
          rounded(dot(beta_0[1:], beta_0[1:])),
          rounded(multiple_r_squared(x, daily_minutes_good, beta_0)))
def mod_SNR(class_1_values, class_2_values):
    return abs(
        (statistics.mean(class_1_values) - statistics.mean(class_2_values)) /
        (statistics.standard_deviation(class_1_values) +
         statistics.standard_deviation(class_2_values)))
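# A small self-contained check (not from the original module) of the
# signal-to-noise-style score above: mean separation divided by the sum of the
# two spreads, here computed with stdlib sample statistics.
import statistics as _s

_class_a = [1.0, 1.2, 0.9, 1.1]
_class_b = [3.0, 3.3, 2.8, 2.9]
_snr = abs((_s.mean(_class_a) - _s.mean(_class_b)) /
           (_s.stdev(_class_a) + _s.stdev(_class_b)))
print(_snr)  # larger values mean the classes are better separated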
def scale(data: List[Vector]) -> Tuple[Vector, Vector]:
    dim = len(data[0])
    means = vector_mean(data)
    stdevs = [standard_deviation([vector[i] for vector in data])
              for i in range(dim)]
    return means, stdevs
def least_squares_fit(x, y):
    beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    alpha = mean(y) - beta * mean(x)
    return alpha, beta
def main():
    from statistics import daily_minutes_good
    random.seed(0)
    # I used trial and error to choose niters and step_size.
    # This will run for a while.
    learning_rate = 0.001
    beta = least_squares_fit(inputs, daily_minutes_good, learning_rate,
                             5000, 25)
    assert 30.50 < beta[0] < 30.70   # constant
    assert 0.96 < beta[1] < 1.00     # num friends
    assert -1.89 < beta[2] < -1.85   # work hours per day
    assert 0.91 < beta[3] < 0.94     # has PhD
    assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta) < 0.68

    from typing import Tuple

    def estimate_sample_beta(pairs: List[Tuple[Vector, float]]):
        x_sample = [x for x, _ in pairs]
        y_sample = [y for _, y in pairs]
        beta = least_squares_fit(x_sample, y_sample, learning_rate, 5000, 25)
        print("bootstrap sample", beta)
        return beta

    random.seed(0)  # so that you get the same results as me
    # This will take a couple of minutes!
    bootstrap_betas = bootstrap_statistic(
        list(zip(inputs, daily_minutes_good)), estimate_sample_beta, 100)
    bootstrap_standard_errors = [
        standard_deviation([beta[i] for beta in bootstrap_betas])
        for i in range(4)
    ]
    print(bootstrap_standard_errors)
    # [1.272,    # constant term, actual error = 1.19
    #  0.103,    # num_friends,   actual error = 0.080
    #  0.155,    # work_hours,    actual error = 0.127
    #  1.249]    # phd,           actual error = 0.998

    random.seed(0)
    beta_0 = least_squares_fit_ridge(inputs, daily_minutes_good, 0.0,  # alpha
                                     learning_rate, 5000, 25)
    # [30.51, 0.97, -1.85, 0.91]
    assert 5 < dot(beta_0[1:], beta_0[1:]) < 6
    assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta_0) < 0.69

    beta_0_1 = least_squares_fit_ridge(inputs, daily_minutes_good, 0.1,  # alpha
                                       learning_rate, 5000, 25)
    # [30.8, 0.95, -1.83, 0.54]
    assert 4 < dot(beta_0_1[1:], beta_0_1[1:]) < 5
    assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta_0_1) < 0.69

    beta_1 = least_squares_fit_ridge(inputs, daily_minutes_good, 1,  # alpha
                                     learning_rate, 5000, 25)
    # [30.6, 0.90, -1.68, 0.10]
    assert 3 < dot(beta_1[1:], beta_1[1:]) < 4
    assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta_1) < 0.69

    beta_10 = least_squares_fit_ridge(inputs, daily_minutes_good, 10,  # alpha
                                      learning_rate, 5000, 25)
    # [28.3, 0.67, -0.90, -0.01]
    assert 1 < dot(beta_10[1:], beta_10[1:]) < 2
    assert 0.5 < multiple_r_squared(inputs, daily_minutes_good, beta_10) < 0.6
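# least_squares_fit_ridge is used above but not defined in this excerpt; the
# usual formulation (a sketch, not necessarily the exact implementation here)
# adds an L2 penalty on every coefficient except the intercept:
def ridge_penalty_demo(beta, alpha):
    # alpha = 0 recovers ordinary least squares
    return alpha * sum(b * b for b in beta[1:])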
def least_squares_fit(x, y): """numerical 'perfect' determination of alpha, beta for linear regression""" beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) alpha = mean(y) - beta * mean(x) return alpha, beta
                [random.random() for _ in range(50)] +
                [200 + random.random() for _ in range(50)])

print()
print("medians for bootstrapped tight distribution", [
    round(val, 2)
    for val in sorted(bootstrap_statistic(close_to_100, median, 100))
])
print("medians for bootstrapped extreme distribution", [
    round(val, 2)
    for val in sorted(bootstrap_statistic(far_from_100, median, 100))
])

random.seed(0)
bootstrap_betas = bootstrap_statistic(list(zip(x, daily_minutes_good)),
                                      estimate_sample_beta, 100)
bootstrap_standard_errors = [
    standard_deviation([beta[i] for beta in bootstrap_betas])
    for i in range(4)
]
print()
print(bootstrap_standard_errors)
print()

random.seed(0)
for alpha in (0.0, 0.01, 0.1, 1, 10):
    beta_0 = estimate_beta_ridge(x, daily_minutes_good, alpha=alpha)
    print(alpha, ' : ', rounded(beta_0),
          rounded(dot(beta_0[1:], beta_0[1:])),
          rounded(multiple_r_squared(x, daily_minutes_good, beta_0)))
# 101 points all very close to 100
close_to_100 = [99.5 + random.random() for _ in range(101)]

# 101 points, 50 of them near 0, 50 of them near 200
far_from_100 = ([99.5 + random.random()] +
                [random.random() for _ in range(50)] +
                [200 + random.random() for _ in range(50)])

from statistics import median, standard_deviation

medians_close = bootstrap_statistic(close_to_100, median, 100)
medians_far = bootstrap_statistic(far_from_100, median, 100)

assert standard_deviation(medians_close) < 1
assert standard_deviation(medians_far) > 90

from probability import normal_cdf


def p_value(beta_hat_j: float, sigma_hat_j: float) -> float:
    if beta_hat_j > 0:
        # if the coefficient is positive, we need to compute twice the
        # probability of seeing an even *larger* value
        return 2 * (1 - normal_cdf(beta_hat_j / sigma_hat_j))
    else:
        # otherwise twice the probability of seeing a *smaller* value
        return 2 * normal_cdf(beta_hat_j / sigma_hat_j)
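# normal_cdf comes from the accompanying probability module; an equivalent
# self-contained stand-in (an assumption) uses math.erf, so the two-sided
# p-value above can be checked without that import.
import math


def normal_cdf_demo(x: float) -> float:
    return (1 + math.erf(x / math.sqrt(2))) / 2


# e.g. a coefficient 1.96 standard errors from zero gives p ~ 0.05
print(2 * (1 - normal_cdf_demo(1.96)))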
from dog import Dog
from statistics import standard_deviation

dogs = []
with open('dogs.txt') as f:
    for line in f:
        name, age, sex = line.strip().split(',')
        age = float(age)
        dog = Dog(name, age, sex)
        dogs.append(dog)

ages = [dog.age for dog in dogs]
avgage = sum(ages) / len(ages)
stdage = standard_deviation(ages)
print(dogs, avgage, stdage)
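# The dog module and dogs.txt are not shown; a minimal sketch of what the
# script above appears to assume (names and file format are hypothetical):
#
#   dogs.txt, one dog per line:
#       Rex,3.5,M
#       Bella,2.0,F
#
class DogDemo:
    def __init__(self, name, age, sex):
        self.name = name
        self.age = age
        self.sex = sex

    def __repr__(self):
        return f"Dog({self.name!r}, {self.age}, {self.sex!r})"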