def sort_by_standard_deviation():

    global data_matrix
    global ALL_count
    global snr_tuples
    global progressbar_total

    sample_length = len(data_matrix[0])
    sample_count = len(data_matrix)

    for attribute_index in range(1, sample_length):

        attribute_list = list()

        for sample_index in range(sample_count):
            attribute_list.append(data_matrix[sample_index][attribute_index])

        # summed per-class sample standard deviations; the second class
        # starts at index ALL_count, so no sample is skipped
        sd_value = statistics.standard_deviation(
            attribute_list[:ALL_count]) + statistics.standard_deviation(
                attribute_list[ALL_count:])
        rounded_sd = math.ceil(sd_value * 10000) / 10000  # round up to 4 decimals
        snr_tuples.append((attribute_index, rounded_sd))

    sort.randomized_quick_sort_for_tuples(snr_tuples, 0, len(snr_tuples) - 1)

    progressbar.show(7, progressbar_total, prefix='Progress:',
                     suffix='Complete', length=50)
Example No. 2
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]:
    """Given two vectors x and y,
    find the least-squares values of alpha and beta"""
    beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    alpha = mean(y) - beta * mean(x)
    #print(alpha, beta)
    return alpha, beta
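
A quick sanity check of the closed form above, as a sketch with stand-in helpers (an assumption: the snippet itself imports mean, standard_deviation, and correlation from its own statistics module, presumably with the usual sample-based definitions). Points lying exactly on y = 2x + 1 must give back beta = 2 and alpha = 1:

import math

def mean(xs): return sum(xs) / len(xs)

def standard_deviation(xs):
    m = mean(xs)
    return math.sqrt(sum((x - m) ** 2 for x in xs) / (len(xs) - 1))

def correlation(xs, ys):
    mx, my = mean(xs), mean(ys)
    cov = sum((a - mx) * (b - my) for a, b in zip(xs, ys)) / (len(xs) - 1)
    return cov / (standard_deviation(xs) * standard_deviation(ys))

x = [1, 2, 3, 4, 5]
y = [2 * xi + 1 for xi in x]  # exactly y = 2x + 1

beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
alpha = mean(y) - beta * mean(x)
assert abs(beta - 2.0) < 1e-9 and abs(alpha - 1.0) < 1e-9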
Example No. 3
def sort_by_standard_deviation():

    global transposed_raw_data_matrix
    global ALL_count
    global MLL_count
    global AML_count
    global snr_tuples
    global progressbar_total

    sample_length = len(transposed_raw_data_matrix[0])
    sample_count = len(transposed_raw_data_matrix)
    MLL_end_index = ALL_count + MLL_count

    for attribute_index in range(sample_length - 1):

        attribute_list = list()

        for sample_index in range(sample_count):
            attribute_list.append(
                transposed_raw_data_matrix[sample_index][attribute_index])

        sd_value = statistics.standard_deviation(
            attribute_list[:ALL_count]) + statistics.standard_deviation(
                attribute_list[ALL_count:MLL_end_index]
            ) + statistics.standard_deviation(attribute_list[MLL_end_index:])
        rounded_sd = math.ceil(sd_value * 10000) / 10000
        snr_tuples.append((attribute_index, rounded_sd))

    sort.randomized_quick_sort_for_tuples(snr_tuples, 0, len(snr_tuples) - 1)

    progressbar.show(9,
                     progressbar_total,
                     prefix="Progress:",
                     suffix="Complete",
                     length=50)
Example No. 4
def least_squares_fit(x, y):
    """given training values for x and y,
    find the least-squares values of alpha and beta"""

    # beta (the slope on x) = correlation(x, y) * stdev(y) / stdev(x)
    beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    alpha = mean(y) - beta * mean(x)
    return alpha, beta
Example No. 5
def print_component_statistics_old(n_components_to_show=N_COMPONENTS_TO_SHOW):
    print("component statistics:\n")
    for i in range(n_components_to_show):
        print("component " + str(i) + ":")
        like_comp = likesByComponent[i]
        dislike_comp = dislikesByComponent[i]
        print("means:                     like = " + str(mean(like_comp)) + "     dislike = " + str(mean(dislike_comp)))
        print(
            "medians:                   like = " + str(median(like_comp)) + "     dislike = " + str(median(dislike_comp)))
        print("stdevs:                    like = " + str(standard_deviation(like_comp)) + "     dislike = " + str(
            standard_deviation(dislike_comp)))
        print("interquartile range:       like = " + str(interquartile_range(like_comp)) + "     dislike = " + str(
            interquartile_range(dislike_comp)))
        print("\n")
Example No. 6
    def click_button_add_and_close(self, window, choice_of_calculation):

        if self.check_empty_combobox_graph():
            return

        analyzed_model = self.get_model()
        analysis_model = self.get_analysis(analyzed_model)

        if choice_of_calculation == 1:
            statistics.check_stationarity_click_button(analysis_model)
        elif choice_of_calculation == 2:
            statistics.average_value_click_button(analysis_model)
        elif choice_of_calculation == 3:
            statistics.dispersion_click_button(analysis_model)
        elif choice_of_calculation == 4:
            statistics.dispersion_x_10_click_button(analysis_model)
        elif choice_of_calculation == 5:
            statistics.standard_deviation(analysis_model)
        elif choice_of_calculation == 6:
            statistics.asymmetry_click_button(analysis_model)
        elif choice_of_calculation == 7:
            statistics.asymmetry_coefficient_click_button(analysis_model)
        elif choice_of_calculation == 8:
            statistics.excess_click_button(analysis_model)
        elif choice_of_calculation == 9:
            statistics.kurtosis_click_button(analysis_model)
        elif choice_of_calculation == 10:
            statistics.standard_ratio_click_button(analysis_model)
        elif choice_of_calculation == 11:
            statistics.mean_absolute_deviation_click_button(analysis_model)
        elif choice_of_calculation == 12:
            statistics.x_min_click_button(analysis_model)
        elif choice_of_calculation == 13:
            statistics.x_max_click_button(analysis_model)

        window.destroy()
Example No. 7
def scale(data_matrix):
    """returns the mean and standard deviation of each column"""
    num_rows, num_cols = shape(data_matrix)
    means = [mean(get_column(data_matrix,j)) for j in range(num_cols)]
    stdevs = [standard_deviation(get_column(data_matrix, j))
              for j in range(num_cols)]
    return means, stdevs
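
A tiny worked use of scale, with stand-in helpers (an assumption: the snippet normally imports shape, get_column, mean, and standard_deviation from its own modules):

import math

def shape(m): return len(m), len(m[0])
def get_column(m, j): return [row[j] for row in m]
def mean(xs): return sum(xs) / len(xs)
def standard_deviation(xs):
    mu = mean(xs)
    return math.sqrt(sum((x - mu) ** 2 for x in xs) / (len(xs) - 1))

data = [[1.0, 20.0],
        [2.0, 22.0],
        [3.0, 24.0]]
means, stdevs = scale(data)  # the scale() defined above
print(means)   # [2.0, 22.0]
print(stdevs)  # [1.0, 2.0]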
Example No. 8
def scale(data_matrix):
    num_rows, num_cols = shape(data_matrix)
    means = [mean(get_column(data_matrix, j))
             for j in range(num_cols)]
    stdevs = [standard_deviation(get_column(data_matrix,j))
              for j in range(num_cols)]
    return means, stdevs
Example No. 9
def plot_multiple_histories(histories, directory):

    colors = ['r', 'b', 'g', 'c', 'm', 'y', 'k']
    colors_b = ['r--', 'b--', 'g--', 'c--', 'm--', 'y--', 'k--']
    plt.clf()
    fig, ax = plt.subplots()
    ax.set_xlabel("$Days$")
    ax.set_ylabel("$Average-Population$")

    #print(histories)

    datasets = [[[0] * len(histories)
                 for _ in range(len(next(iter(histories[0].values()))))]
                for _ in range(len(histories[0]))]
    labels = []
    history = 0
    for n in histories:
        i = 0
        for h in n:
            if history == 0:
                labels.append(h)
            j = 0
            for val in n[h]:
                datasets[i][j][history] = val
                j += 1
            i += 1
        history += 1

    #print(datasets)
    """ fig1, ax1 = plt.subplots()
	ax1.set_title('Box Plot')
	ax1.boxplot(datasets[0], positions=[1, 3, 5, 7, 9], widths=0.6)
	ax1.boxplot(datasets[1], positions=[2, 4, 6, 8, 10], widths=0.6)

	ax1.set_xticklabels([0, 1, 2, 3, 4, 5])
	ax1.set_xticks([0, 1.5, 3.5, 5.5, 7.5, 9.5])

	plt.savefig("plots/box_plot.png")"""

    p_color = 0
    for populations in datasets:
        std = []
        mean = []
        for days in populations:
            mean.append(statistics.mean(days))
            std.append(statistics.standard_deviation(days))

        plt.plot(range(len(mean)),
                 mean,
                 colors[p_color % 7],
                 label=labels[p_color % 7])
        plt.plot(range(len(mean)), [sum(x) for x in zip(mean, std)],
                 colors_b[p_color % 7],
                 label="{} +/- $\\sigma$".format(labels[p_color % 7]))
        plt.plot(range(len(mean)), [x - y for (x, y) in zip(mean, std)],
                 colors_b[p_color % 7])
        p_color += 1

    ax.legend()
    plt.savefig("plots/{}/histories_deviation.png".format(directory))
Example No. 10
def scale(data_matrix):
    num_rows, num_cols = shape(data_matrix)
    means = [mean(get_column(data_matrix, j)) for j in range(num_cols)]
    stdevs = [
        standard_deviation(get_column(data_matrix, j)) for j in range(num_cols)
    ]
    return means, stdevs
Example No. 11
    def fit(self, train_data_features, train_data_labels):
        assert isinstance(train_data_features, np.ndarray)
        assert isinstance(train_data_labels, np.ndarray)
        assert train_data_features.shape[0] == len(train_data_labels)
        # labels are used for the prior P(C) and the likelihood P(X|C);
        # first, find the row indices belonging to each label
        # (we assume the label vector contains categorical values)
        instances_by_label = {}
        for label in set(train_data_labels):
            instances_by_label[label] = np.where(train_data_labels == label)[0]
        # prior probabilities
        prior_probabilities = {}  # a dictionary (key: label, value: prior)
        for label in set(train_data_labels):
            prior_probabilities[label] = len(
                instances_by_label[label]) / float(len(train_data_labels))
        # features are used for the likelihood P(X|C) probabilities;
        # split the columns into continuous and categorical ones
        categorical_columns = []
        continuous_columns = []
        for i in range(len(train_data_features[0])):
            try:
                _ = [
                    float(feature_value)
                    for feature_value in set(train_data_features[:, i])
                ]
                # conversion succeeded: the column holds continuous values,
                # so we model it with a distribution
                continuous_columns.append(i)
            except ValueError:
                # conversion failed: the column holds categorical values
                categorical_columns.append(i)
        # continuous features
        cont_mat = train_data_features[:, continuous_columns].astype(float)
        likelihood_probability_arguments = {
        }  # a dictionary (key: label, value: [mean, standard deviation])
        for label in set(train_data_labels):
            likelihood_probability_arguments[label] = [
                st.mean(cont_mat[instances_by_label[label], :]),
                st.standard_deviation(cont_mat[instances_by_label[label], :])
            ]
        # categorical features
        cate_mat = train_data_features[:, categorical_columns].astype(str)
        likelihood_probabilities = {
        }  # a dictionary (key: "value|label", value: conditional probability)
        for label in set(train_data_labels):
            sub_train_data_features = train_data_features[
                instances_by_label[label], :]
            for i in range(cate_mat.shape[1]):
                # index via categorical_columns so we read the i-th
                # categorical column, not the i-th column overall
                feature_column = sub_train_data_features[:,
                                                         categorical_columns[i]]
                for feature_value in set(feature_column):
                    likelihood_tag = feature_value + "|" + label
                    likelihood_probabilities[likelihood_tag] = len(
                        np.where(feature_column == feature_value)[0]) / float(
                            len(feature_column))
        # package the training results on the instance
        self._continues_columns = continuous_columns
        self._categorical_columns = categorical_columns
        self._prior_probabilities = prior_probabilities
        self._likelihood_probabilities = likelihood_probabilities
        self._likelihood_probability_arguments = likelihood_probability_arguments
Example No. 12
def scale(matrix):
    """returns the means and standard deviations of each column"""
    num_rows, num_columns = shape(matrix)
    means = [mean(get_column(matrix, j)) for j in range(num_columns)]
    stdevs = [
        standard_deviation(get_column(matrix, j)) for j in range(num_columns)
    ]
    return means, stdevs
Example No. 13
def printComponentStatistics():
    print("component statistics:\n")
    for i in range(N_COMPONENTS_TO_SHOW):
        print("component " + str(i) + ":")
        likeComp = likesByComponent[i]
        dislikeComp = dislikesByComponent[i]
        print("means:                     like = " + str(mean(likeComp)) +
              "     dislike = " + str(mean(dislikeComp)))
        print("medians:                   like = " + str(median(likeComp)) +
              "     dislike = " + str(median(dislikeComp)))
        print("stdevs:                    like = " +
              str(standard_deviation(likeComp)) + "     dislike = " +
              str(standard_deviation(dislikeComp)))
        print("interquartile range:       like = " +
              str(interquartile_range(likeComp)) + "     dislike = " +
              str(interquartile_range(dislikeComp)))
        print("\n")
Example No. 14
def make_random_dress(pca, save_name, liked):
    random_array = []
    base = likesByComponent if liked else dislikesByComponent
    for c in base[:100]:
        mu = mean(c)
        sigma = standard_deviation(c)
        p = random.uniform(0.0, 1.0)
        num = inverse_normal_cdf(p, mu, sigma)
        random_array.append(num)
    construct(pca, random_array, 'results/createdDresses/' + save_name)
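
The loop above is inverse transform sampling: a uniform p pushed through the inverse normal CDF yields a Normal(mu, sigma) draw. A minimal stand-in for inverse_normal_cdf, assuming a bisection search over an erf-based normal_cdf (the snippet imports its own version):

import math
import random

def normal_cdf(x, mu=0.0, sigma=1.0):
    return (1 + math.erf((x - mu) / (math.sqrt(2) * sigma))) / 2

def inverse_normal_cdf(p, mu=0.0, sigma=1.0, tolerance=1e-5):
    # for non-standard parameters, rescale a standard-normal answer
    if mu != 0 or sigma != 1:
        return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance)
    low, high = -10.0, 10.0  # normal_cdf(-10) ~ 0, normal_cdf(10) ~ 1
    while high - low > tolerance:
        mid = (low + high) / 2  # bisect until we pin down the answer
        if normal_cdf(mid) < p:
            low = mid
        else:
            high = mid
    return mid

sample = inverse_normal_cdf(random.uniform(0.0, 1.0), mu=5.0, sigma=2.0)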
Example No. 15
def scale(data: List[Vector]) -> Tuple[Vector, Vector]:
    """returns the means and standard deviations for each position"""
    dim = len(data[0])

    means = vector_mean(data)
    stdevs = [
        standard_deviation([vector[i] for vector in data]) for i in range(dim)
    ]

    return means, stdevs
Example No. 16
def makeRandomDress(saveName, liked):
    randomArr = []
    base = likesByComponent if liked else dislikesByComponent
    for c in base[:100]:
        mu = mean(c)
        sigma = standard_deviation(c)
        p = random.uniform(0.0, 1.0)
        num = inverse_normal_cdf(p, mu, sigma)
        randomArr.append(num)
    construct(randomArr, "results/createdDresses/" + saveName)
Example No. 17
def make_random_dress(pca, save_name, liked):
    random_array = []
    base = likesByComponent if liked else dislikesByComponent
    for c in base[:100]:
        mu = mean(c)
        sigma = standard_deviation(c)
        p = random.uniform(0.0, 1.0)
        num = inverse_normal_cdf(p, mu, sigma)
        random_array.append(num)
    construct(pca, random_array, 'results/createdDresses/' + save_name)
Example No. 18
def makeRandomDress(saveName, liked):
    randomArr = []
    base = likesByComponent if liked else dislikesByComponent
    for c in base[:100]:
        mu = mean(c)
        sigma = standard_deviation(c)
        p = random.uniform(0.0, 1.0)
        num = inverse_normal_cdf(p, mu, sigma)
        randomArr.append(num)
    construct(randomArr, 'results/createdDresses/' + saveName)
Example No. 19
def scale(data: List[Vector]) -> Tuple[Vector, Vector]:
    "Return the column-wise mean and standard deviation of a dataset"
    dim = len(data[0])

    means = vector_mean(data)
    std_devs = [
        standard_deviation([vector[i] for vector in data]) for i in range(dim)
    ]

    return means, std_devs
Example No. 20
def print_component_statistics(all_by_components, n_components_to_show=N_COMPONENTS_TO_SHOW):
    print("component statistics:\n")
    for i in range(n_components_to_show):
        print("component " + str(i) + ":")
        comp = all_by_components[i]
        print("means:                     like = " + str(mean(comp)))
        print(
            "medians:                   like = " + str(median(comp)))
        print("stdevs:                    like = " + str(standard_deviation(comp)))
        print("interquartile range:       like = " + str(interquartile_range(comp)))
        print("\n")
Example No. 21
def print_component_statistics(all_by_components,
                               n_components_to_show=N_COMPONENTS_TO_SHOW):
    print("component statistics:\n")
    for i in range(n_components_to_show):
        print("component " + str(i) + ":")
        comp = all_by_components[i]
        print("means:                     like = " + str(mean(comp)))
        print("medians:                   like = " + str(median(comp)))
        print("stdevs:                    like = " +
              str(standard_deviation(comp)))
        print("interquartile range:       like = " +
              str(interquartile_range(comp)))
        print("\n")
Example No. 22
def estimation(num, attempts):
    estimates = []
    for _ in range(attempts):
        pi_estimation = setup_points(num)
        estimates.append(pi_estimation)

    estimates_mean = mean(estimates)
    sigma = standard_deviation(estimates)

    print(
        f'mean={round(estimates_mean, 5)}, sigma={round(sigma, 5)}, points={num}'
    )
    return estimates_mean, sigma
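
setup_points is not shown here; a plausible stand-in (an assumption, not the original) is the quarter-circle Monte Carlo estimator, whose hit fraction approaches pi/4:

import random

def setup_points(num):
    inside = 0
    for _ in range(num):
        x, y = random.random(), random.random()
        if x * x + y * y <= 1.0:  # point falls inside the quarter circle
            inside += 1
    return 4 * inside / num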
Example No. 23
File: plots.py Project: r41d/kivs
def statisticalPlot(pingcalls):
    # create data for Gnuplot
    mini, avg, maxi = statistics.minavgmax(pingcalls)
    dat = [mini, avg, maxi, statistics.standard_deviation(pingcalls, avg)]

    # set graph properties
    g3 = Gnuplot.Gnuplot(persist=1)
    g3.reset()                           # reset graph
    g3.title('statistical information')  # set title of graph
    g3.xlabel('Stats')                   # set title of x-axis
    g3.ylabel('Zeit in ms')              # set title of y-axis ("time in ms")
    g3('set autoscale')                  # autoscale
    g3("set xtics ('minimum' 0, 'average' 1, 'maximum' 2, 'standard_deviation' 3)")
    g3.plot(Gnuplot.Data(dat, with_='boxes'))
Example No. 24
def printComponentStatistics():
    print ("component statistics:\n")
    for i in range(min(N_COMPONENTS_TO_SHOW, numComponents, len(likesByComponent), len(dislikesByComponent))):
        print ("component " + str(i) + ":")
        likeComp = likesByComponent[i]
        dislikeComp = dislikesByComponent[i]
        print ("means:                     like = " + str(mean(likeComp)) + "     dislike = " + str(mean(dislikeComp)))
        print (
            "medians:                   like = " + str(median(likeComp)) + "     dislike = " + str(median(dislikeComp))
        )
        print (
            "stdevs:                    like = "
            + str(standard_deviation(likeComp))
            + "     dislike = "
            + str(standard_deviation(dislikeComp))
        )
        print (
            "interquartile range:       like = "
            + str(interquartile_range(likeComp))
            + "     dislike = "
            + str(interquartile_range(dislikeComp))
        )
        print ("\n")
Example No. 25
def estimation(number_of_needles, number_of_tries):
    estimations = []

    for _ in range(number_of_tries):
        pi_estimation = throw_needles(number_of_needles)

        estimations.append(pi_estimation)

    median_estimation = median(estimations)

    sigma = standard_deviation(estimations)

    print(
        f'Est={round(median_estimation, 5)}, sigma={round(sigma, 5)}, agujas={number_of_needles}'
    )

    return (median_estimation, sigma)
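
throw_needles is likewise not shown; the name suggests Buffon's needle, so here is a hedged stand-in (an assumption) with needle length equal to the line spacing, where the crossing probability is 2/pi:

import math
import random

def throw_needles(number_of_needles):
    crossings = 0
    for _ in range(number_of_needles):
        centre = random.random() / 2           # distance to the nearest line, in [0, 0.5)
        theta = random.random() * math.pi / 2  # acute angle against the lines
        if centre <= math.sin(theta) / 2:      # the needle crosses a line
            crossings += 1
    # P(cross) = 2/pi, so pi is roughly 2 * n / crossings
    return 2 * number_of_needles / crossings if crossings else float('inf')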
Example No. 26
def binned_means_graph_errors(data, name, min_count=10, min_width=10, title='', xlabel='', ylabel=''):
	# Expect data to be of the form:
	# [(x,y), ...]
	data = sorted(data, key=operator.itemgetter(0))
	binner = Binner(data, min_count, min_width)
	binner.bin()
	pieces = binner.get_bins()
	points = [ ]
	for piece in pieces:
		xdata, ydata = zip(*piece)
		# x error is bin width / 2
		xmax = max(xdata)
		xmin = min(xdata)
		xwidth = xmax - xmin
		xcentre = xmin + (xwidth / 2.0)
		xerror = xwidth / 2
		ymean = mean(ydata)
		ysigma = standard_deviation(ydata, ymean)
		yerror = ysigma / math.sqrt(len(ydata))
		points.append((xcentre, ymean, xerror, yerror))
	return graph_errors(points, name, title=title, xlabel=xlabel, ylabel=ylabel)
Example No. 27
def plotByCategory2(self):
    figure = plot.figure()
    canvas = FigureCanvas(figure)

    xl = pd.ExcelFile("../corpus-final09.xls")
    df = xl.parse("File list")

    colName = 'LCS Ratio (By Sentence, Advanced Pre)'

    # selectNon = df.loc[df['Category'] == 'non']
    
    categories = ['non','heavy','light','cut']
    colors = ['y.','g.','b.','r.']

    ax = figure.add_subplot(111)
    ax.axis([-1,4,0,1.2])

    values = []
    means = []
    std_devs = []
    for x in range(0,4):
        values.append(df.loc[df['Category'] == categories[x]][colName])
        ax.plot(len(values[x])*[x],values[x],colors[x])
        m = stats.mean(values[x])
        std = stats.standard_deviation(values[x])
        m_str = str('%.3f' % round(m,3))
        std_str = str('%.3f' % round(std,3))
        ax.text((x+.1),m,'mean = '+m_str+'\nstd =  '+std_str)


    handles,labelsX = ax.get_legend_handles_labels()
    ax.legend(handles,categories,loc='lower right')

    ax.yaxis.grid()
    ax.xaxis.set_ticklabels(['','non','heavy','light','cut',''])

    canvas.draw()
    return canvas
Example No. 28
    def test_standard_deviation(self):
        self.assertEqual(math.sqrt(10),
                         statistics.standard_deviation([1, 2, 3, 4, 4, 10]))
        self.assertEqual(0, statistics.standard_deviation([1]))
        self.assertEqual(0, statistics.standard_deviation([]))
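
The first assertion checks the sample (n - 1) definition: the values [1, 2, 3, 4, 4, 10] have mean 4 and squared deviations summing to 50, so the variance is 50 / 5 = 10 and the standard deviation is sqrt(10).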
def bootstrap_statistic(data, stats_fn, num_samples):
    """Evaluates stats_fn on num_samples bootstrap samples from data"""
    return [stats_fn(bootstrap_sample(data)) for _ in range(num_samples)]

import random

# 101 points all very close to 100
close_to_100 = [99.5 + random.random() for _ in range(101)]
# 101 points, 50 of them near 0 and 50 of them near 200
far_from_100 = ([99.5 + random.random()] +
                [random.random() for _ in range(50)] +
                [200 + random.random() for _ in range(50)])

from statistics import median, standard_deviation

medians_close = bootstrap_statistic(close_to_100, median, 100)
medians_far = bootstrap_statistic(far_from_100, median, 100)

print("bootstrap_statistic(close_to_100, median, 100):", medians_close)
print("bootstrap_statistic(far_from_100, median, 100):", medians_far)

std_close = standard_deviation(medians_close)
std_far = standard_deviation(medians_far)

print(f"std_medians_close = {std_close}, std_medians_far = {std_far}")

from typing import List, Tuple

def estimate_sample_beta(pairs: List[Tuple[Vector, float]]) -> Vector:
    x_sample = [x for x, _ in pairs]
    y_sample = [y for _, y in pairs]
    beta = least_squares_fit(x_sample, y_sample, learning_rate, 5000, 25)
    print("Bootstrap sample", beta)
    return beta

random.seed(0)  # so that you get the same results as me

bootstrap_betas = bootstrap_statistic(list(zip(x, daily_minutes_good)),
                                      estimate_sample_beta,
                                      100)

bootstrap_standard_errors = [
    standard_deviation([beta[i] for beta in bootstrap_betas])
    for i in range(4)]

print("bootstrap standard errors", bootstrap_standard_errors)

print("p_value(30.63, 1.174)", p_value(30.63, 1.174))
print("p_value(0.972, 0.079)", p_value(0.972, 0.079))
print("p_value(-1.868, 0.131)", p_value(-1.868, 0.131))
print("p_value(0.911, 0.990)", p_value(0.911, 0.990))

print("regularization")

random.seed(0)
for alpha in [0.0, 0.01, 0.1, 1, 10]:
    beta_0 = estimate_beta_ridge(x, daily_minutes_good, alpha=alpha)
    print(alpha, ':', rounded(beta_0), rounded(dot(beta_0[1:], beta_0[1:])),
          rounded(multiple_r_squared(x, daily_minutes_good, beta_0)))
Example No. 31
def least_squares_fit(x, y):
    """given training values for x and y
    find the least-squares values of alpha and beta"""
    beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    alpha = mean(y) - beta * mean(x)
    return alpha, beta
Example No. 32

import numpy
import random
import statistics
import matplotlib.pyplot as plt
'''
Calculate the mean, mode, median, standard deviation and variance.
'''

data = numpy.random.normal(1, 0.5, 10000)

mean = statistics.mean(data)
mode = statistics.mode(data)
median = statistics.median(data)
std = statistics.standard_deviation(data)
variance = statistics.variance(data)

print('')
print('Mean: ' + str(mean))
print('Mode: ' + str(mode))
print('Median: ' + str(median))
print('Standard Deviation: ' + str(std))
print('Variance: ' + str(variance))

plt.xlabel('value')
plt.ylabel('count')
plt.hist(data, 50)
plt.show()
'''
Calculate the covariance of uncorrelated data.
'''
Example No. 33
def calculate_zscore(value, value_list):
    return (value - statistics.mean(value_list)) / statistics.standard_deviation(value_list)
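
Worked example, assuming the sample standard deviation: for value_list = [2, 4, 6, 8, 10] the mean is 6 and the standard deviation is sqrt(10), roughly 3.1623, so calculate_zscore(10, value_list) is (10 - 6) / 3.1623, roughly 1.26.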
Example No. 34
    def test_standard_deviation(self):
        self.assertEqual(math.sqrt(10),
                         statistics.standard_deviation([1, 2, 3, 4, 4, 10]))
        self.assertEqual(0, statistics.standard_deviation([1]))
        self.assertEqual(0, statistics.standard_deviation([]))
    random.seed(0)
    beta = estimate_beta(x, daily_minutes_good)  # [30.63, 0.972, -1.868, 0.911]
    print("beta", rounded(beta))
    print("r-squared", rounded(multiple_r_squared(x, daily_minutes_good, beta)))

    close_to_100 = [99.5 + random.random() for _ in range(101)]
    far_from_100 = ([99.5 + random.random()] +
                    [random.random() for _ in range(50)] +
                    [200 + random.random() for _ in range(50)])

    print("medians for bootstrapped tight distribution",
          [round(val, 2) for val in sorted(bootstrap_statistic(close_to_100, median, 100))])
    print("medians for bootstrapped extreme distribution",
          [round(val, 2) for val in sorted(bootstrap_statistic(far_from_100, median, 100))])

    random.seed(0)
    bootstrap_betas = bootstrap_statistic(list(zip(x, daily_minutes_good)),
                                          estimate_sample_beta,
                                          100)
    bootstrap_standard_errors = [standard_deviation([beta[i] for beta in bootstrap_betas])
                                 for i in range(4)]

    print(bootstrap_standard_errors)

    random.seed(0)
    for alpha in (0.0, 0.01, 0.1, 1, 10):
        beta_0 = estimate_beta_ridge(x, daily_minutes_good, alpha=alpha)
        print(alpha, ':', rounded(beta_0), rounded(dot(beta_0[1:], beta_0[1:])),
              rounded(multiple_r_squared(x, daily_minutes_good, beta_0)))
Example No. 36
def mod_SNR(class_1_values, class_2_values):
    return abs(
        (statistics.mean(class_1_values) - statistics.mean(class_2_values)) /
        (statistics.standard_deviation(class_1_values) +
         statistics.standard_deviation(class_2_values)))
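
Worked example: classes [8, 10, 12] and [3, 4, 5] have means 10 and 4 and sample standard deviations 2 and 1, so mod_SNR returns abs(10 - 4) / (2 + 1) = 2.0.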
Example No. 37
def scale(data: List[Vector]) -> Tuple[Vector, Vector]:
    dim = len(data[0])
    means = vector_mean(data)
    stdevs = [standard_deviation([vector[i] for vector in data]) for i in range(dim)]
    return means, stdevs
Example No. 38
def least_squares_fit(x, y):
    beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    alpha = mean(y) - beta * mean(x)
    return alpha, beta
Example No. 39
def main():
    from statistics import daily_minutes_good

    random.seed(0)
    # I used trial and error to choose niters and step_size.
    # This will run for a while.
    learning_rate = 0.001

    beta = least_squares_fit(inputs, daily_minutes_good, learning_rate, 5000,
                             25)
    assert 30.50 < beta[0] < 30.70  # constant
    assert 0.96 < beta[1] < 1.00  # num friends
    assert -1.89 < beta[2] < -1.85  # work hours per day
    assert 0.91 < beta[3] < 0.94  # has PhD

    assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta) < 0.68

    from typing import Tuple

    def estimate_sample_beta(pairs: List[Tuple[Vector, float]]):
        x_sample = [x for x, _ in pairs]
        y_sample = [y for _, y in pairs]
        beta = least_squares_fit(x_sample, y_sample, learning_rate, 5000, 25)
        print("bootstrap sample", beta)
        return beta

    random.seed(0)  # so that you get the same results as me

    # This will take a couple of minutes!
    bootstrap_betas = bootstrap_statistic(
        list(zip(inputs, daily_minutes_good)), estimate_sample_beta, 100)

    bootstrap_standard_errors = [
        standard_deviation([beta[i] for beta in bootstrap_betas])
        for i in range(4)
    ]

    print(bootstrap_standard_errors)

    # [1.272,    # constant term, actual error = 1.19
    #  0.103,    # num_friends,   actual error = 0.080
    #  0.155,    # work_hours,    actual error = 0.127
    #  1.249]    # phd,           actual error = 0.998

    random.seed(0)
    beta_0 = least_squares_fit_ridge(
        inputs,
        daily_minutes_good,
        0.0,  # alpha
        learning_rate,
        5000,
        25)
    # [30.51, 0.97, -1.85, 0.91]
    assert 5 < dot(beta_0[1:], beta_0[1:]) < 6
    assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta_0) < 0.69

    beta_0_1 = least_squares_fit_ridge(
        inputs,
        daily_minutes_good,
        0.1,  # alpha
        learning_rate,
        5000,
        25)
    # [30.8, 0.95, -1.83, 0.54]
    assert 4 < dot(beta_0_1[1:], beta_0_1[1:]) < 5
    assert 0.67 < multiple_r_squared(inputs, daily_minutes_good,
                                     beta_0_1) < 0.69

    beta_1 = least_squares_fit_ridge(
        inputs,
        daily_minutes_good,
        1,  # alpha
        learning_rate,
        5000,
        25)
    # [30.6, 0.90, -1.68, 0.10]
    assert 3 < dot(beta_1[1:], beta_1[1:]) < 4
    assert 0.67 < multiple_r_squared(inputs, daily_minutes_good, beta_1) < 0.69

    beta_10 = least_squares_fit_ridge(
        inputs,
        daily_minutes_good,
        10,  # alpha
        learning_rate,
        5000,
        25)
    # [28.3, 0.67, -0.90, -0.01]
    assert 1 < dot(beta_10[1:], beta_10[1:]) < 2
    assert 0.5 < multiple_r_squared(inputs, daily_minutes_good, beta_10) < 0.6
Example No. 40
def least_squares_fit(x, y):
    """numerical 'perfect' determination of alpha, beta for linear regression"""
    beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    alpha = mean(y) - beta * mean(x)
    return alpha, beta
Example No. 41
    close_to_100 = [99.5 + random.random() for _ in range(101)]
    far_from_100 = ([99.5 + random.random()] +
                    [random.random() for _ in range(50)] +
                    [200 + random.random() for _ in range(50)])

    print("medians for bootstrapped tight distribution", [
        round(val, 2)
        for val in sorted(bootstrap_statistic(close_to_100, median, 100))
    ])
    print("medians for bootstrapped extreme distribution", [
        round(val, 2)
        for val in sorted(bootstrap_statistic(far_from_100, median, 100))
    ])

    random.seed(0)
    bootstrap_betas = bootstrap_statistic(list(zip(x, daily_minutes_good)),
                                          estimate_sample_beta, 100)
    bootstrap_standard_errors = [
        standard_deviation([beta[i] for beta in bootstrap_betas])
        for i in range(4)
    ]

    print(bootstrap_standard_errors)

    random.seed(0)
    for alpha in (0.0, 0.01, 0.1, 1, 10):
        beta_0 = estimate_beta_ridge(x, daily_minutes_good, alpha=alpha)
        print(alpha, ':', rounded(beta_0),
              rounded(dot(beta_0[1:], beta_0[1:])),
              rounded(multiple_r_squared(x, daily_minutes_good, beta_0)))
Example No. 42
# 101 points all very close to 100
close_to_100 = [99.5 + random.random() for _ in range(101)]

# 101 points, 50 of them near 0, 50 of them near 200
far_from_100 = ([99.5 + random.random()] +
                [random.random() for _ in range(50)] +
                [200 + random.random() for _ in range(50)])

from statistics import median, standard_deviation

medians_close = bootstrap_statistic(close_to_100, median, 100)

medians_far = bootstrap_statistic(far_from_100, median, 100)

assert standard_deviation(medians_close) < 1
assert standard_deviation(medians_far) > 90

from probability import normal_cdf


def p_value(beta_hat_j: float, sigma_hat_j: float) -> float:
    if beta_hat_j > 0:
        # if the coefficient is positive, we need to compute twice the
        # probability of seeing an even *larger* value
        return 2 * (1 - normal_cdf(beta_hat_j / sigma_hat_j))
    else:
        # otherwise twice the probability of seeing a *smaller* value
        return 2 * normal_cdf(beta_hat_j / sigma_hat_j)
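
The p-values printed earlier in this listing line up with this function; as a quick check (coefficient and standard-error values taken from the bootstrap output above):

assert p_value(30.63, 1.174) < 0.001        # constant term: clearly nonzero
assert 0.3 < p_value(0.911, 0.990) < 0.4    # PhD term: consistent with zero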

Example No. 43
from dog import Dog
from statistics import standard_deviation

dogs = []
with open('dogs.txt') as f:
    for line in f:
        name, age, sex = line.strip().split(',')
        age = float(age)
        dog = Dog(name, age, sex)
        dogs.append(dog)

ages = [dog.age for dog in dogs]
avgage = sum(ages) / len(ages)
stdage = standard_deviation(ages)
print(dogs, avgage, stdage)
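
The dog module is not shown; a minimal stand-in that makes the snippet runnable (an assumption about its interface, inferred from the Dog(name, age, sex) call):

class Dog:
    def __init__(self, name, age, sex):
        self.name = name
        self.age = age
        self.sex = sex

    def __repr__(self):
        return f"Dog({self.name!r}, {self.age}, {self.sex!r})"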