# Exploratory_data_analysis_in_python.py
"""Exploratory Data Analysis in Python"""
"""Chapter 1 - Read, Clean and Validate"""
# Dataframes and series
# Qu: What is the average birth weight of babies in the US?
# - Find the appropriate data
# - Read data in your development environment
# - Clean and validate
# Reading data
import numpy as np
import pandas as pd
# Load the 'nsfg' table from the HDF5 file into a DataFrame.
nsfg = pd.read_hdf('nsfg.hdf5', 'nsfg')
type(nsfg)
nsfg.shape  # Attribute: number of rows and columns
nsfg.columns  # Variable (column) names

# Clean and validate
# `pounds` and `ounces` were used in these notes without ever being defined.
# NOTE(review): the column names below are presumed from the NSFG codebook --
# confirm against nsfg.columns.
pounds = nsfg['birthwgt_lb1']
ounces = nsfg['birthwgt_oz1']
pounds.value_counts().sort_index()  # See which values appear, sorted by value
pounds.describe()  # Summary statistics

# Replace
# Option 1: replace() returns a new Series, so reassign the result.
# (Do not combine reassignment with inplace=True: the in-place form returns None.)
pounds = pounds.replace([98, 99], np.nan)
# Option 2: modify the existing Series; the keyword is `inplace`, not `in_place`.
ounces.replace([98, 99], np.nan, inplace=True)

# Combine into a single series (16 ounces per pound)
birth_weight = pounds + ounces / 16.0
# Filter and visualise
# The histogram is drawn on the non-missing values only, so drop NaNs first.
# The plotting functions (hist, plot, show, ...) live in matplotlib.pyplot,
# not in the top-level matplotlib package.
import matplotlib.pyplot as plt
plt.hist(birth_weight.dropna(), bins=30)
preterm = nsfg['prglngth'] < 37  # Boolean Series

# Filtering
preterm_weight = birth_weight[preterm]  # Use ~preterm to select the opposite rows

# Logical operators combine two boolean Series:
# & = and
# | = or
# A and B below stand for any two boolean Series; they are placeholders that
# are not defined in this file, so the examples are kept as comments.
# birth_weight[A & B]  # Both true
# birth_weight[A | B]  # Either or both true
# Resampling
# Some groups may be 'oversampled'
# We can correct by using: resample_rows_weighted()
# Probability mass functions
# GSS - General Social Survey
# gss was used here before ever being loaded; read it with the same call the
# regression chapter uses.
gss = pd.read_hdf('gss.hdf5', 'gss')
educ = gss['educ']

# PMF - probability mass function
# Contains the unique values in a series and how often each one appears
# NOTE(review): Pmf is not imported anywhere in this file -- presumably it
# comes from the empiricaldist package or a course helper; confirm.
pmf_educ = Pmf(educ, normalize=False)  # With normalize=True the frequencies add up to one
pmf_educ.head()  # Shows values on the left and counts on the right
pmf_educ[12]  # Look up 12 years of education
# A bar chart of the PMF is better than a histogram - it shows all unique values
# Cumulative distribution functions
# A CDF maps each possible value of a distribution to a cumulative probability.
# Relationship between PMF and CDF, for a random element drawn from a distribution:
# - PMF(x) is the probability of getting exactly x
# - CDF(x) is the probability of getting a value less than or equal to x
# Usage mirrors Pmf: substitute Cdf for Pmf.
# The CDF is invertible: given a probability p, inverse() returns the matching value.
p = 0.25
cdf = Cdf(gss['educ'])
q = cdf.inverse(p)
print(q)
# - IQR is based on percentiles, so unlike variance it is not thrown off by
#   extreme values or outliers
# It is sometimes more robust than variance
# Comparing distributions
# Multiple PMFs can be plotted on the same axes
# The mask must be named `male`: it is used as `male` / `~male` below
# (it was previously assigned to `sex`, which left `male` undefined).
male = gss['sex'] == 1  # Boolean Series: True where sex is coded 1
age = gss['age']
male_age = age[male]
female_age = age[~male]
Pmf(male_age).plot(label='Male')
Pmf(female_age).plot(label='Female')
plt.xlabel('Age (years)')
# NOTE(review): if Pmf normalizes by default this axis is a probability, not a count -- confirm.
plt.ylabel('Count')
plt.legend()  # Without a legend the 'Male'/'Female' labels are never shown
plt.show()

# Multiple CDFs
Cdf(male_age).plot(label='Male')
Cdf(female_age).plot(label='Female')
plt.xlabel('Age (years)')
plt.ylabel('CDF')  # A CDF is a cumulative probability, not a count
plt.legend()
plt.show()
# CDFs are smoother in general - they can give a better view of real differences in distributions
# Income distribution
# Split respondents' real income by whether the survey year is before 1995
# and draw one PMF per group.
pre95 = gss['year'] < 1995
income = gss['realinc']
Pmf(income[pre95]).plot(label="Before 1995")
Pmf(income[~pre95]).plot(label="After 1995")
# A CDF gives a clearer picture here - good for exploratory analysis
# Modeling distributions
# The normal distribution
# Draw 1000 values from a normal distribution with mean 0 and std 1
# (the defaults, spelled out explicitly) and plot their empirical CDF.
sample = np.random.normal(loc=0.0, scale=1.0, size=1000)
Cdf(sample).plot()
# The result is a sigmoid-shaped curve
# SciPy provides an object called norm that represents the normal distribution
from scipy.stats import norm
xs = np.linspace(-3, 3)  # Array of equally spaced points from -3 to 3
# norm(0, 1) creates an object that represents a normal distribution with
# mean 0 and std 1; .cdf(xs) evaluates the CDF of that distribution at xs.
# Written on one line: a comment after a backslash continuation is a SyntaxError.
ys = norm(0, 1).cdf(xs)
# Draw the model CDF in gray, then overlay the CDF of the sample.
plt.plot(xs, ys, color='gray')
Cdf(sample).plot()  # Must be called: a bare `.plot` only references the method and draws nothing
# Comparing the sample CDF to the normal CDF shows whether the model is a good fit
# The bell curve
# Evaluate the PDF of a normal distribution (mean 0, std 1) on an evenly
# spaced grid from -3 to 3 and draw it in gray.
xs = np.linspace(-3, 3)
ys = norm.pdf(xs, 0, 1)
plt.plot(xs, ys, color='gray')
# Comparing the bell curve directly against the PMF won't work
# Kernel density estimation - uses the points in the sample to estimate the
# PDF of the distribution they came from
# This is how to get from a PMF (probability mass function) to a PDF
# (probability density function)
# KDE plot
import seaborn as sns
sns.kdeplot(sample)

# Compare the KDE plot with the normal PDF on the same axes
xs = np.linspace(-3, 3)
ys = norm(0, 1).pdf(xs)
plt.plot(xs, ys, color='gray')
sns.kdeplot(sample)
# The PDF is a more sensitive way to look for differences, but often too sensitive
# Use CDFs for exploration - good for distracting from noise, but not well known
# Use PMFs if there are a small number of unique values
# Use KDE if there are a lot of values
# In many datasets the distribution of income is lognormal - i.e. the log of
# incomes fits a normal distribution
"""Chapter 3 - Relationships"""
# Exploring relationships
# BRFSS - Behavioural Risk Factor Surveillance System - CDC data
# Uses a random sub-sample of 100,000
# Scatter plot
brfss = pd.read_hdf('brfss.hdf5', 'brfss')
height = brfss['HTM4']
weight = brfss['WTKG3']

# plt.plot with the format string 'o' (one circle per data point) is faster
# than a scatter call. The same figure is drawn three times:
#   1. plain markers - overplotted: points pile on top of each other, which
#      gives misleading results
#   2. with transparency (alpha) to reduce the overplotting
#   3. with transparency and smaller markers
for marker_options in ({}, {'alpha': 0.02}, {'alpha': 0.02, 'markersize': 1}):
    plt.plot(height, weight, 'o', **marker_options)
    plt.xlabel('Height in cm')
    plt.ylabel('Weight in kg')
    plt.show()
# Jittering = adding random noise to the values
# Each series gets its own normal noise (mean 0, std 2), one draw per row.
n_rows = len(brfss)
height_jitter = height + np.random.normal(0, 2, size=n_rows)
weight_jitter = weight + np.random.normal(0, 2, size=n_rows)
dot_style = {'markersize': 1, 'alpha': 0.02}
plt.plot(height_jitter, weight_jitter, 'o', **dot_style)
plt.show()

# Zoom: same plot, restricted to the given axis limits
plt.plot(height_jitter, weight_jitter, 'o', **dot_style)
plt.axis([140, 200, 0, 160])  # [x min, x max, y min, y max]
plt.show()
# Visualising relationships
# Violin plot - estimates the KDE plot for each column (within each group)
# Missing data must be removed before it can be used.
# `subset` takes ONE list of column names; passing two separate lists
# (subset=['AGE'],['WTKG3']) is a SyntaxError.
data = brfss.dropna(subset=['AGE', 'WTKG3'])
sns.violinplot(x='AGE', y='WTKG3', data=data, inner=None)  # inner=None simplifies the plot slightly
plt.show()
# Each column - graphical representation of the distribution of weight in one age group
# Width - proportional to the estimated density - two vertical PDFs plotted back to back

# Box plot
sns.boxplot(x='AGE', y='WTKG3', data=data, whis=10)  # whis=10 turns off a feature we don't need
plt.show()
# Each box - IQR
# Middle line - median
# Spine - min / max

# With data skewed towards higher values it is sometimes useful to look at a
# logarithmic scale; the pyplot function yscale does this.
sns.boxplot(x='AGE', y='WTKG3', data=data, whis=10)  # whis=10 turns off a feature we don't need
plt.yscale('log')
plt.show()
# Correlation
# Correlation coefficient - from -1 to 1 - quantifies the strength of a linear relationship
# .corr() - result: correlation matrix
# If the relationship is non-linear, .corr() will generally underestimate its strength

# Generate fake data: a parabola plus random noise
xs = np.linspace(-1, 1)  # Equally spaced points
ys = xs**2
# `normal` on its own is undefined in this file; the generator is np.random.normal.
ys += np.random.normal(0, 0.05, len(xs))  # x^2 + random noise (mean 0, std 0.05)

# Correlation says nothing about slope
# Correlation - one variable can be used to predict the other
# The statistic we care about - the slope of the line
# Simple regression
# Measures the strength of the effect
from scipy.stats import linregress

# Hypothetical 1
res = linregress(xs, ys)
# res is a linregress result object with, among others:
#   slope     - slope of the line of best fit
#   intercept - intercept of that line
#   rvalue    - correlation

# Plot the line of best fit (only meaningful for linear relationships).
# Two points are enough to draw a straight line: evaluate it at min and max of xs.
fx = np.array([np.min(xs), np.max(xs)])
fy = res.slope * fx + res.intercept
plt.plot(fx, fy, '-')
"""Chapter 4 - Multivariate thinking"""
# Limits of simple regression
# Regression is not symmetric
# The two directions differ because they make different assumptions
# x = known quantity
# y = random
# Regression is not causation

# Multiple regression
# SciPy doesn't do multiple regression
# Switch to statsmodels
import statsmodels.formula.api as smf
# First argument - formula string: income as a function of vegetable consumption.
# .fit() runs the regression and returns the results.
# Written on one line: a comment after a backslash continuation is a SyntaxError.
results = smf.ols('INCOME2 ~ _VEGESU1', data=brfss).fit()
results.params  # Contains slope and intercept
# Multiple regression
# Load the GSS table once; both models below read the same DataFrame, so the
# file does not need to be read from disk a second time.
gss = pd.read_hdf('gss.hdf5', 'gss')
results = smf.ols('realinc ~ educ', data=gss).fit()  # realinc is the variable to predict, using educ
results.params

# Adding age
results = smf.ols('realinc ~ educ + age', data=gss).fit()  # realinc predicted using educ and age
results.params
# Income and age
# groupby returns a GroupBy object with one group for each value of age;
# it behaves like a DataFrame.
grouped = gss.groupby('age')
realinc_groups = grouped['realinc']
mean_income_by_age = realinc_groups.mean()  # pandas Series: mean income for each age group
plt.plot(mean_income_by_age, 'o', alpha=0.5)
plt.xlabel('Age (years)')
plt.ylabel('Income (1986 $)')
# Age and income have a non-linear relationship
# Adding a quadratic term
# Store age squared as its own column so the formula can refer to it by name.
gss['age2'] = gss['age'].pow(2)
model = smf.ols(formula='realinc ~ educ + age + age2', data=gss)
results = model.fit()
results.params
# Visualising regression results
# Generating predictions
# Build a DataFrame of explanatory values: a range of ages with education
# held constant at 12 years.
df = pd.DataFrame()
df['age'] = np.linspace(18, 85)
df['age2'] = df['age'] ** 2
df['educ'] = 12
df['educ2'] = df['educ'] ** 2
pred12 = results.predict(df)  # Predicted average income for each age, holding education constant
# Result - a Series with one prediction for each row
plt.plot(df['age'], pred12, label="High school")
# Comparison data: average income in each age group, drawn as circles.
# The marker format string is the letter 'o'; the digit '0' is not a valid format string.
plt.plot(mean_income_by_age, 'o', alpha=0.5)
plt.xlabel('Age (years)')
plt.ylabel('Income (1986 $)')
plt.legend()
# Can repeat for different levels of education, e.g. Associate's degree, Bachelor's degree
# This helps validate the model: predictions can be compared against the data
# Logistic regression
# Categorical variables - e.g. sex, race
# To include one in a regression, wrap it in C() - "C" marks a categorical variable
# The formula refers to educ2, which was never added to gss; create it here.
gss['educ2'] = gss['educ'] ** 2
formula = 'realinc ~ educ + educ2 + age + age2 + C(sex)'
results = smf.ols(formula, data=gss).fit()  # `smf.old` was a typo for `smf.ols`
results.params
# For a categorical variable, the parameter indicates the difference between
# the default value and the other value
# If there are only two values it is a boolean variable
# Variables need to be recoded so that 1 means yes and 0 means no
# E.g. recode 2 -> 0. Assign the result back to the column: calling
# replace(..., inplace=True) on gss['gunlaw'] operates on an intermediate
# Series and is not guaranteed to update gss.
gss['gunlaw'] = gss['gunlaw'].replace([2], [0])
# Logistic regression
# The formula refers to educ2; make sure the column exists in gss
# (recomputing it is harmless if it is already there).
gss['educ2'] = gss['educ'] ** 2
formula = 'gunlaw ~ age + age2 + educ + educ2 + C(sex)'
results = smf.logit(formula, data=gss).fit()
# Params are in the form of log odds
# >0 makes the outcome more likely
# <0 makes the outcome less likely

# Generate predictions
# A range of ages with education held constant at 12 years.
df = pd.DataFrame()
df['age'] = np.linspace(18, 89)
df['educ'] = 12
df['age2'] = df['age'] ** 2
df['educ2'] = df['educ'] ** 2

df['sex'] = 1  # Generates predictions for men (sex == 1 is the male mask used earlier in these notes)
pred1 = results.predict(df)

# NOTE(review): 2 is presumed to be the code for women in gss['sex'] -- confirm
# against the data. The previous value 0 is presumably not a level that
# C(sex) saw when the model was fitted, so it cannot be used for prediction.
df['sex'] = 2  # Generates predictions for women
pred2 = results.predict(df)
# Visualising results
# Mean of the recoded gunlaw variable within each age group, drawn as
# circles, with the two prediction curves on top.
grouped = gss.groupby('age')
favor_by_age = grouped['gunlaw'].mean()
plt.plot(favor_by_age, 'o', alpha=0.5)
for predicted, group_label in ((pred1, 'Male'), (pred2, 'Female')):
    plt.plot(df['age'], predicted, label=group_label)
plt.xlabel('Age')
plt.ylabel('Probability of favoring gun law')
plt.legend()