# agemodel.py
"""
This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
NAME: agemodel.py
"""
import math
import matplotlib
import matplotlib.pyplot as pyplot
import correlation
import cumulative
import _01_survey
import _02_first
import _03_thinkstats
import _04_Pmf
import _05_myplot
import _06_descriptive
import _13_Cdf
"""
* Results:
First babies, ages, trimmed mean: 23.0784947977
Other babies, ages, trimmed mean: 26.6647683689
Difference in means: 3.5862735712
First babies, weights, trimmed mean: 115.558686539
Other babies, weights, trimmed mean: 117.734924078
Difference in means: 2.17623753873
* First babies are 2.2 oz lighter; their mothers are 3.6 years younger.
Pearson correlation 0.0683745752519
Spearman correlation 0.0987971917949
(inter, slope): 109.522876323 0.287729420217
R^2 0.00467508254087
* The units of inter are ounces; the units of slope are ounces per year.
* Each additional year adds 0.3 ounces to the mean birth weight.
* But the correlation is quite weak.
Weight difference explained by age: 1.03187641538
Fraction explained: 0.474156151163
* The age difference could account for 50% of the weight difference.
* If we bin births by mother's age, we see that the relationship is nonlinear,
* so the estimated slope is probably too low, which means that the fraction
* of the weight difference explained by age is probably more than 50%.
Bin Mean weight (oz)
10.0 117.295081967
15.0 113.487096774
20.0 116.505546218
25.0 118.190963342
30.0 118.323802716
35.0 118.743842365
40.0 114.054054054
* If we trim very low and very high weights, the correlations are a
* little higher, but the difference is small enough that it is a non-issue.
Pearson correlation 0.084795033619
Spearman correlation 0.103080620319
(inter, slope): 110.400666041 0.297751296651
R^2 0.00719019772646
"""
def _process(table, name):
    """
    Run the age and weight analyses on a table and store the results on it.

    Calls cumulative._process(table, name) first, then sets these
    attributes on the table:
        ages: list of agepreg values, skipping records where it is 'NA'
        age_pmf: Pmf object built from ages
        age_cdf: Cdf object built from ages
        weights: list of totalwgt_oz values, skipping records where it is 'NA'
        weight_cdf: Cdf object built from weights

    Args:
        table: Table object exposing a `records` sequence
        name: label forwarded to cumulative._process. The Pmf/Cdf objects
            are labelled with table.name, which is presumably set by that
            call -- verify in cumulative.
    """
    cumulative._process(table, name)

    def _known(attr):
        # Gather one attribute across all records, dropping the 'NA' marker.
        values = (getattr(record, attr) for record in table.records)
        return [value for value in values if value != 'NA']

    table.ages = _known('agepreg')
    table.age_pmf = _04_Pmf._make_pmf_from_list(table.ages, table.name)
    table.age_cdf = _13_Cdf._make_cdf_from_list(table.ages, table.name)

    table.weights = _known('totalwgt_oz')
    table.weight_cdf = _13_Cdf._make_cdf_from_list(table.weights, table.name)
def _make_tables(data_dir='.'):
    """
    Read the survey data and return the processed tables.

    Args:
        data_dir: directory passed through to _02_first._make_tables

    Returns:
        tuple (pool, firsts, others) of Tables, each already run through
        _process; pool is built from firsts and others by
        _06_descriptive._pool_records
    """
    # The first element returned by _02_first._make_tables is not used here.
    _, firsts, others = _02_first._make_tables(data_dir)
    pool = _06_descriptive._pool_records(firsts, others)

    labelled_tables = (
        (pool, 'live births'),
        (firsts, 'first babies'),
        (others, 'others'),
    )
    for tbl, label in labelled_tables:
        _process(tbl, label)

    return pool, firsts, others
def _get_age_weight(table, low=0.0, high=20.0):
"""
Get sequences of mother's age and birth weight.
Args:
table: Table object
low: float min weight in pounds
high: float max weight in pounds
Returns:
tuple of sequences (ages, weights)
"""
ages = []
weights = []
for r in table.records:
if r.agepreg == 'NA' or r.totalwgt_oz == 'NA':
continue
if r.totalwgt_oz < low * 16 or r.totalwgt_oz > high * 16:
continue
ages.append(r.agepreg)
weights.append(r.totalwgt_oz)
return ages, weights
def _partition(ages, weights, bin_size=2):
"""
Break ages into bins.
Returns a map from age to list of weights.
"""
weight_dict = {}
for age, weight in zip(ages, weights):
bin = bin_size * math.floor(age / bin_size) + bin_size / 2.0
weight_dict.setdefault(bin, []).append(weight)
for bin, bin_weights in weight_dict.iteritems():
try:
mean = _03_thinkstats._mean(bin_weights)
except ZeroDivisionError:
continue
return weight_dict
def _make_figures(pool, firsts, others):
    """
    Create several figures for the book.

    Produces, in order: the pooled age CDF, the pooled weight CDF, the
    first-vs-others age CDFs, the first-vs-others weight CDFs, and a hexbin
    plot of age against weight. Each is written with _05_myplot._save.

    Args:
        pool: Table with age_cdf and weight_cdf attributes and records
        firsts: Table with age_cdf and weight_cdf attributes
        others: Table with age_cdf and weight_cdf attributes
    """
    age_title = "Distribution of mother's age"
    weight_title = "Distribution of birth weight"
    age_axis = 'age (years)'
    weight_axis = 'birth weight (oz)'

    # (output root, Cdf attribute name, title, x-axis label)
    pooled_specs = [
        ('agemodel_age_cdf', 'age_cdf', age_title, age_axis),
        ('agemodel_weight_cdf', 'weight_cdf', weight_title, weight_axis),
    ]
    for root, cdf_attr, title, xlabel in pooled_specs:
        _05_myplot._clf()
        _05_myplot._cdf(getattr(pool, cdf_attr))
        _05_myplot._save(root=root,
                         title=title,
                         xlabel=xlabel,
                         ylabel='CDF',
                         legend=False)

    # Same layout, but comparing first babies with the others; these calls
    # do not pass `legend`, so _save's own default applies.
    paired_specs = [
        ('agemodel_age_cdfs', 'age_cdf', age_title, age_axis),
        ('agemodel_weight_cdfs', 'weight_cdf', weight_title, weight_axis),
    ]
    for root, cdf_attr, title, xlabel in paired_specs:
        _05_myplot._clf()
        _05_myplot._cdfs([getattr(firsts, cdf_attr), getattr(others, cdf_attr)])
        _05_myplot._save(root=root,
                         title=title,
                         xlabel=xlabel,
                         ylabel='CDF')

    # Age against weight, drawn as a hexbin plot with a reversed gray
    # colormap (a plain pyplot.scatter with alpha=0.2 was the alternative).
    ages, weights = _get_age_weight(pool)
    pyplot.clf()
    pyplot.hexbin(ages, weights, cmap=matplotlib.cm.gray_r)
    _05_myplot._save(root='agemodel_scatter',
                     xlabel='Age (years)',
                     ylabel='Birth weight (oz)',
                     legend=False)
def _difference_in_means(firsts, others, attr):
    """
    Compute the difference in means between two tables for a given attr.

    Prints both means, their difference, and a trailing blank line.

    Args:
        firsts: Table for first babies
        others: Table for other babies
        attr: name of the table attribute holding the values (main passes
            'ages' and 'weights')

    Returns:
        others' mean minus firsts' mean, as computed by _03_thinkstats._mean
    """
    # NOTE(review): the labels say "trimmed mean" but the values come from
    # _03_thinkstats._mean; whether any trimming happens is not visible here.
    firsts_mean = _03_thinkstats._mean(getattr(firsts, attr))
    print('First babies, %s, trimmed mean:' % attr, firsts_mean)

    others_mean = _03_thinkstats._mean(getattr(others, attr))
    print('Other babies, %s, trimmed mean:' % attr, others_mean)

    diff = others_mean - firsts_mean
    print('Difference in means:', diff)
    # A bare `print` is a no-op expression on Python 3; call it so the
    # blank separator line is actually emitted.
    print()

    return diff
def _compute_least_squares(ages, weights):
    """
    Compute a least squares fit of weights against ages.

    Prints the Pearson and Spearman correlations, the fitted intercept and
    slope, R^2, and a trailing blank line.

    Args:
        ages: sequence of ages (the x values)
        weights: sequence of weights (the y values), paired with ages

    Returns:
        tuple (inter, slope, R2) from the correlation module's
        _least_squares and _coef_determination
    """
    # compute the correlation between age and weight
    print('Pearson correlation', correlation._corr(ages, weights))
    print('Spearman correlation', correlation._spearman_corr(ages, weights))

    # compute least squares fit, then R^2 from its residuals
    inter, slope = correlation._least_squares(ages, weights)
    print('(inter, slope):', inter, slope)

    res = correlation._residuals(ages, weights, inter, slope)
    R2 = correlation._coef_determination(weights, res)
    print('R^2', R2)
    # A bare `print` is a no-op expression on Python 3; call it so the
    # blank separator line is actually emitted.
    print()

    return inter, slope, R2
def main(name, data_dir=''):
    """
    Run the full age/weight analysis: print statistics and save figures.

    Args:
        name: not used in the body; receives sys.argv[0] when run as a script
        data_dir: directory passed to _make_tables to locate the survey data
    """
    pool, firsts, others = _make_tables(data_dir)

    for table in [pool, firsts, others]:
        # Keep each table's summary on one line: end=' ' replaces the
        # Python 2 trailing-comma idiom, which no longer suppresses the
        # newline on Python 3.
        print(table.name, len(table.records), end=' ')
        print(len(table.ages), len(table.weights))

    # compute differences in mean age and weight
    age_diff = _difference_in_means(firsts, others, 'ages')
    weight_diff = _difference_in_means(firsts, others, 'weights')

    # get ages and weights
    ages, weights = _get_age_weight(pool)

    # compute a least squares fit
    inter, slope, R2 = _compute_least_squares(ages, weights)

    # see how much of the weight difference is explained by age
    weight_diff_explained = age_diff * slope
    print('Weight difference explained by age:', weight_diff_explained)
    print('Fraction explained:', weight_diff_explained / weight_diff)
    # A bare `print` is a no-op expression on Python 3; call it so the
    # blank separator line is actually emitted.
    print()

    # plot mean weight per age bin, using _partition's default bin size
    # NOTE(review): this comment used to say "5-year age bins", but no
    # bin_size is passed here -- confirm which width is intended.
    weight_dict = _partition(ages, weights)
    _make_line_plot(weight_dict)

    # the correlations are slightly higher if we trim outliers
    ages, weights = _get_age_weight(pool, low=4, high=12)
    inter, slope, R2 = _compute_least_squares(ages, weights)

    _make_figures(pool, firsts, others)
def _make_line_plot(age_bins):
    """
    Plot mean birth weight against age bin and save the figure.

    Args:
        age_bins: dict mapping an age bin to the list of weights in that
            bin, as returned by _partition
    """
    # Iterate the keys in sorted order. The previous iteritems() call does
    # not exist on Python 3.
    xs = sorted(age_bins)
    ys = [_03_thinkstats._mean(age_bins[midpoint]) for midpoint in xs]

    _05_myplot._plot(xs, ys, 'bs-')
    _05_myplot._save(root='agemodel_line',
                     xlabel="Mother's age (years)",
                     ylabel='Mean birthweight (oz)',
                     legend=False)
if __name__ == '__main__':
    import sys

    # Forward the command line unchanged: argv[0] (the script path) becomes
    # `name`, and an optional first argument becomes `data_dir`.
    cli_args = sys.argv
    main(*cli_args)