# working_with_data.py
# %%
from typing import List, Dict
from collections import Counter
import math
import matplotlib.pyplot as plt
def bucketize(point: float, bucket_size: float) -> float:
    """Round point down to the nearest multiple of bucket_size.

    For example, with a bucket size of 10, 27.5 maps to 20 and -3 maps
    to -10.
    """
    # Count the whole buckets lying at or below the point, then scale that
    # count back up to get the bucket's lower edge.
    whole_buckets = math.floor(point / bucket_size)
    return bucket_size * whole_buckets
def make_histogram(points: List[float], bucket_size: float) -> Dict[float, int]:
    """Count how many points land in each bucket.

    Every point is converted to a bucket key with bucketize(); the result
    maps each key that occurred to the number of points that produced it.
    """
    counts: Counter = Counter()
    for value in points:
        counts[bucketize(value, bucket_size)] += 1
    return counts
def plot_histogram(points: List[float], bucket_size: float, title: str = ""):
    """Draw a bar chart of the bucketed counts of points via pyplot.

    One bar is drawn per occupied bucket, placed at the bucket's key and
    bucket_size wide. Nothing is displayed here; the caller decides when
    to show the figure.
    """
    counts = make_histogram(points, bucket_size)
    bucket_keys = counts.keys()
    bar_heights = counts.values()
    plt.bar(bucket_keys, bar_heights, width=bucket_size)
    plt.title(title)
# %%
import random
from probability import inverse_normal_cdf
# Seed the generator so both samples come out the same on every run.
random.seed(0)

# 10,000 uniform draws: random.random() is scaled by 200 and shifted by
# -100 so the values fall between -100 and 100.
uniform = [random.random() * 200 - 100 for _ in range(10_000)]

# 10,000 normal draws with mean 0 and sd 57, made by passing uniform draws
# through inverse_normal_cdf and scaling by 57
# (assumes inverse_normal_cdf defaults to the standard normal -- confirm).
normal = [57 * inverse_normal_cdf(random.random()) for _ in range(10_000)]
# %%
# Histogram of the uniform sample, in buckets 10 wide.
plot_histogram(uniform, bucket_size=10, title="Uniform Hist")
# %%
# Histogram of the normal sample, using the same bucket width.
plot_histogram(normal, bucket_size=10, title="Normal Hist")
# %%
def random_normal() -> float:
    """Return one random draw from a standard normal distribution.

    A uniform draw from random.random() is passed through
    inverse_normal_cdf, which is called with its default parameters.
    """
    uniform_draw = random.random()
    return inverse_normal_cdf(uniform_draw)
# 1000 base draws, then two noisy series derived from them. Each element
# adds half of a fresh random_normal() draw as noise: ys1 to x itself,
# ys2 to the negation of x.
xs = [random_normal() for _ in range(1000)]
ys1 = [random_normal() / 2 + x for x in xs]
ys2 = [random_normal() / 2 - x for x in xs]
# %%
# Overlay both series against xs on one scatter plot, told apart by shade.
for series, shade, tag in ((ys1, 'black', 'ys1'), (ys2, 'gray', 'ys2')):
    plt.scatter(xs, series, marker='.', color=shade, label=tag)

plt.xlabel('xs')
plt.ylabel('ys')
plt.legend(loc=9)  # matplotlib location code 9 is 'upper center'
plt.title("Very Different Joint Distributions")
plt.show()
# %%
from statisticss import correlation
# Print the correlation of xs with each of the two noisy series in turn.
for series in (ys1, ys2):
    print(correlation(xs, series))
# %%
from linear_algebra import Matrix, Vector, make_matrix
def correlation_matrix(data: List[Vector]) -> Matrix:
    """Build the matrix of pairwise correlations between the vectors in data.

    The result is len(data) x len(data); its (i, j)-th entry is
    correlation(data[i], data[j]).
    """
    size = len(data)
    # The callback supplies the value for a given (i, j) position;
    # presumably make_matrix invokes it once per cell -- confirm.
    return make_matrix(size, size, lambda i, j: correlation(data[i], data[j]))
# %%
from typing import List
# Just some random data to show off correlation scatterplots
# num_points is the number of rows generated by random_row() for corr_rows.
num_points = 100
def random_row() -> List[float]:
    """Generate one row of four related values.

    The columns are built in order, each from the ones before it:
      0: a random_normal() draw
      1: -5 times column 0, plus a random_normal() draw
      2: column 0 plus column 1, plus 5 times a random_normal() draw
      3: 6 when column 2 is greater than -2, otherwise 0
    """
    first = random_normal()
    second = -5 * first + random_normal()
    third = first + second + 5 * random_normal()
    if third > -2:
        fourth = 6
    else:
        fourth = 0
    return [first, second, third, fourth]
# Generate num_points rows, then transpose them so that each entry of
# corr_data holds one column's values across all rows.
corr_rows = [random_row() for _ in range(num_points)]
corr_data = list(map(list, zip(*corr_rows)))
# %%
# corr_data is a list of four 100-d vectors; draw one panel per pair.
num_vectors = len(corr_data)
fig, ax = plt.subplots(num_vectors, num_vectors)

last_row = num_vectors - 1
for i in range(num_vectors):
    for j in range(num_vectors):
        panel = ax[i][j]

        if i == j:
            # Diagonal panels show the series name instead of a plot.
            panel.annotate("series " + str(i), (0.5, 0.5),
                           xycoords='axes fraction',
                           ha="center", va="center")
        else:
            # Off-diagonal panels scatter column j (x-axis) against
            # column i (y-axis).
            panel.scatter(corr_data[j], corr_data[i])

        # Keep the x-axis only on the bottom row and the y-axis only on
        # the left column.
        if i != last_row:
            panel.xaxis.set_visible(False)
        if j != 0:
            panel.yaxis.set_visible(False)

# The bottom-right and top-left panels contain only text, so their axis
# limits are off; copy the limits from a scatter panel that shares the
# same column (for x) or the same row (for y).
ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
ax[0][0].set_ylim(ax[0][1].get_ylim())

plt.show()
# %%