-
Notifications
You must be signed in to change notification settings - Fork 1
/
exploredata.py
120 lines (85 loc) · 3.63 KB
/
exploredata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
from __future__ import division
import math
import random
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm
from pprint import pprint
from collections import Counter
from utils import correlation
def bucketize(point, bucket_size):
'''floor the point into a bucket'''
return bucket_size * math.floor(point / bucket_size)
def make_histogram(points, bucket_size):
return Counter(bucketize(point, bucket_size) for point in points)
def plot_histogram(points, bucket_size, title=''):
histogram = make_histogram(points, bucket_size)
plt.bar(histogram.keys(), histogram.values(), width=bucket_size)
plt.title(title)
plt.show()
def plot1d():
random.seed(0)
# uniform between -100 and 100
uniform = [200 * random.random() - 100 for _ in range(10000)]
# normal distribution with mean 0, standard deviation 57
normal = [57 * norm.ppf(random.random())
for _ in range(10000)]
plot_histogram(uniform, 10, "Uniform Histogram")
plot_histogram(normal, 10, "Normal Histogram")
def random_normal():
'''returns a random draw from a standard normal distribution'''
return norm.ppf(random.random())
def plot2d():
xs = [random_normal() for _ in range(1000)]
ys1 = [x + random_normal() / 2 for x in xs]
ys2 = [-x + random_normal() / 2 for x in xs]
print(correlation(xs, ys1))
print(correlation(xs, ys2))
plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
plt.scatter(xs, ys2, marker='.', color='gray', label='ys2')
plt.xlabel('xs')
plt.ylabel('ys')
plt.legend(loc=9)
plt.title('Very Different Joint Distributions')
plt.show()
def correlation_matrix(data):
'''returns the num_columns x num_columns matrix whose (i, j)th entry
is the correlation between columns i and j of data'''
_, num_columns = data.shape()
def matrix_entry(i, j):
return correlation(data[:,i], data[:,j])
return make_matrix(num_columns, num_columns, matrix_entry)
def plot_correlation_matrix():
num_points = 100
def random_row():
row = [None, None, None, None]
row[0] = random_normal()
row[1] = -5 * row[0] + random_normal()
row[2] = row[0] + row[1] + 5 * random_normal()
row[3] = 6 if row[2] > -2 else 0
return row
random.seed(0)
data = np.matrix([random_row() for _ in range(num_points)])
_, num_columns = data.shape
fig, ax = plt.subplots(num_columns, num_columns)
for i in range(num_columns):
for j in range(num_columns):
# scatter column_j on the x-axis vs column_i on the y-axis
if i != j: ax[i][j].scatter(data[:,j], data[:,i])
# unless i == j, in which case show the series name
else: ax[i][j].annotate('series ' + str(i), (0.5, 0.5),
xycoords='axes fraction',
ha='center', va='center')
# then hide axis labels except left and bottom charts
if i < num_columns - 1: ax[i][j].xaxis.set_visible(False)
if j > 0: ax[i][j].yaxis.set_visible(False)
# fix the bottom right and top left axis labels, which are wrong because
# their charts only have text in them
ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
ax[0][0].set_ylim(ax[0][1].get_ylim())
plt.show()
if __name__ == '__main__':
#plot1d()
#plot2d()
plot_correlation_matrix()