-
Notifications
You must be signed in to change notification settings - Fork 0
/
correlation.py
215 lines (156 loc) · 4.28 KB
/
correlation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
"""
This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
NAME: correlation.py
"""
import math
import random
import _03_thinkstats
def _cov(xs, ys, mux=None, muy=None):
"""
Computes Cov(X, Y).
Args:
xs: sequence of values
ys: sequence of values
mux: optional float mean of xs
muy: optional float mean of ys
Returns:
Cov(X, Y)
"""
if mux is None:
mux = _03_thinkstats._mean(xs)
if muy is None:
muy = _03_thinkstats._mean(ys)
total = 0.0
for x, y in zip(xs, ys):
total += (x - mux) * (y - muy)
return total / len(xs)
def _corr(xs, ys):
    """
    Computes Corr(X, Y).

    The covariance of the two sequences is divided by the square root of
    the product of their variances.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        Corr(X, Y)
    """
    # _mean_var's result is unpacked as (mean, variance) for each sequence.
    mean_x, var_x = _03_thinkstats._mean_var(xs)
    mean_y, var_y = _03_thinkstats._mean_var(ys)

    covariance = _cov(xs, ys, mean_x, mean_y)
    scale = math.sqrt(var_x * var_y)
    return covariance / scale
def _serial_corr(xs):
    """
    Computes the serial correlation of a sequence.

    Args:
        xs: sliceable sequence of values

    Returns:
        the correlation between the sequence without its last element
        and the sequence without its first element
    """
    leading = xs[:-1]
    lagged = xs[1:]
    return _corr(leading, lagged)
def _spearman_corr(xs, ys):
    """
    Computes Spearman's rank correlation.

    Each sequence is replaced by its ranks and the ordinary correlation
    of the two rank lists is returned.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        float Spearman's correlation
    """
    return _corr(_map_to_ranks(xs), _map_to_ranks(ys))
def _least_squares(xs, ys):
    """
    Computes a linear least squares fit for ys as a function of xs.

    The slope is Cov(X, Y) divided by the variance of xs, and the
    intercept is chosen so the line passes through the two means.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        tuple of (intercept, slope)
    """
    mean_x, var_x = _03_thinkstats._mean_var(xs)
    # Only the mean of ys is needed; its variance is discarded.
    mean_y, _ = _03_thinkstats._mean_var(ys)

    slope = _cov(xs, ys, mean_x, mean_y) / var_x
    intercept = mean_y - slope * mean_x
    return intercept, slope
def _fit_line(xs, inter, slope):
"""
Returns the fitted line for the range of xs.
Args:
xs: x values used for the fit
slope: estimated slope
inter: estimated intercept
"""
fxs = min(xs), max(xs)
fys = [x * slope + inter for x in fxs]
return fxs, fys
def _residuals(xs, ys, inter, slope):
"""
Computes residuals for a linear fit with parameters inter and slope.
Args:
xs: independent variable
ys: dependent variable
inter: float intercept
slope: float slope
Returns:
list of residuals
"""
res = [y - inter - slope * x for x, y in zip(xs, ys)]
return res
def _coef_determination(ys, res):
    """
    Computes the coefficient of determination (R^2) for given residuals.

    The result is one minus the ratio of the variance of the residuals
    to the variance of ys.

    Args:
        ys: dependent variable
        res: residuals

    Returns:
        float coefficient of determination
    """
    # Only the variances are used; the means from _mean_var are discarded.
    _, var_ys = _03_thinkstats._mean_var(ys)
    _, var_res = _03_thinkstats._mean_var(res)

    unexplained = var_res / var_ys
    return 1 - unexplained
def _map_to_ranks(t):
"""
Returns a list of ranks corresponding to the elements in t.
Args:
t: sequence of numbers
Returns:
list of integer ranks, starting at 1
"""
# pair up each value with its index
pairs = enumerate(t)
# sort by value
sorted_pairs = sorted(pairs, key=lambda pair: pair[1])
# pair up each pair with its rank
ranked = enumerate(sorted_pairs)
# sort by index
resorted = sorted(ranked, key=lambda trip: trip[1][0])
# extract the ranks
ranks = [trip[0] + 1 for trip in resorted]
return ranks
def _correlated_generator(rho):
"""
Generates standard normal variates with correlation.
Args:
rho: target coefficient of correlation
Returns:
iterable
"""
x = random.gauss(0, 1)
yield x
sigma = math.sqrt(1 - rho ** 2)
while True:
x = random.gauss(x * rho, sigma)
yield x
def _correlated_normal_generator(mu, sigma, rho):
    """
    Generates normal variates with correlation.

    Every value produced by _correlated_generator(rho) is multiplied by
    sigma and shifted by mu.

    Args:
        mu: mean of variate
        sigma: standard deviation of variate
        rho: target coefficient of correlation

    Returns:
        iterable
    """
    yield from (z * sigma + mu for z in _correlated_generator(rho))
def main():
    """Script entry point; intentionally does nothing."""
    return None
# Run main() only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()