# feat.py -- forked from romanlutz/NFLPlayPrediction
#Load games
from __future__ import division
import nflgame
# Extract features
import re
from collections import defaultdict
import numpy as np
# Substrings whose presence anywhere in the raw play description excludes the
# play from the data set.
# TODO: include sacks? probably not since we can't assign them to any play option
# TODO: Additionally maybe even booth review, official timeout?
# TODO: Fumble plays should count as if Fumble didn't happen?
# TODO: plays with declined penalties should be counted ((4:52) A.Foster right tackle to HOU 43 for 13 yards (J.Cyprien). Penalty on JAC-S.Marks, Defensive Offside, declined.)
# TODO: plays with accepted penalties that do not nullify the play should be counted (keyword: No Play)
# TODO: error with group when using 2013
# TODO: Should we count Def. Pass Interference? Def. Holding?
_EXCLUDED_DESC_MARKERS = (
    ' punt',
    'Two-Minute Warning',
    'spiked the ball to stop the clock',
    'kneels to ',
    'Delay of Game',
    'Penalty on',
    'sacked at',
    'Punt formation',
    'Direct snap to',
    'Aborted',
    'temporary suspension of play',
    'TWO-POINT CONVERSION ATTEMPT',
    'warned for substitution infraction',
    'no play run - clock started',
    'challenged the first down ruling',
    '*** play under review ***',
    'Direct Snap',
    'Direct snap',
)

# Literal (old, new) rewrites applied in this order while normalising a
# description.  Most of them add the '. ' sentence terminator that the
# sentence splitter relies on.
_LITERAL_REWRITES = (
    ('New QB for Denver - No.6 - Brock Osweiler ',
     'New QB for Denver - No.6 - B.Osweiler. '),
    (' at QB ', ' at QB. '),
    (' at qb ', ' at QB. '),
    (' at Qb ', ' at QB. '),
    (' in as QB for this play ', ' in as QB for this play. '),
    (' in as QB ', ' in as QB. '),
    (' in as quarterback ', ' in as QB. '),
    (' in at Quarterback ', ' in as QB. '),
    (' is now playing ', ' is now playing. '),
    (' Seminole Formation ', ' '),
    (' St. ', ' St.'),
    (' A.Randle El ', ' A.Randle '),
    ('Alex Smith ', 'A.Smith '),
)


def _is_usable_play(play):
    """Return True when *play* passes the note, clock and description filters."""
    if play.note not in (None, 'TD', 'INT'):
        return False
    if play.time is None:
        return False
    raw_desc = play.desc
    if raw_desc[:4] in ('END ', 'End '):
        return False
    return not any(marker in raw_desc for marker in _EXCLUDED_DESC_MARKERS)


def _situational_features(play):
    """Build the feature dict holding everything known before the snap."""
    features = defaultdict(float)
    features['team'] = play.team
    game = play.drive.game
    features['opponent'] = game.home if game.away == play.team else game.away

    minutes, seconds = play.time.clock.split(':')[:2]
    features['time'] = float(minutes) * 60 + float(seconds)
    if play.time.qtr in (1, 3):
        # Quarters 1 and 3 get a further 15 minutes added.
        # NOTE(review): presumably this makes 'time' the seconds left in the
        # half, assuming the game clock counts down -- confirm.
        features['time'] += 15 * 60
    features['half'] = 1 if play.time.qtr <= 2 else 2

    # NOTE(review): the meaning of yardline.offset is defined by nflgame;
    # confirm the intended orientation of 'position'.
    features['position'] = 50 - play.yardline.offset
    features['down'] = play.down
    features['togo'] = play.yards_togo
    if 'Shotgun' in play.desc:
        features['shotgun'] = 1
    return features


def _end_sentence_after(pattern, text):
    """Append '. ' to every occurrence of the first match of *pattern*."""
    found = re.search(pattern, text)
    if found is None:
        return text
    fragment = found.group(0)
    return text.replace(fragment, fragment.rstrip() + '. ')


def _normalize_description(full_desc):
    """Rewrite a raw description so that '. ' reliably separates sentences."""
    full_desc = full_desc.replace('No. ', 'No.')

    # Glue a spaced initial to what follows (" T. Brady" -> " T.Brady") so the
    # initial's period is not mistaken for the end of a sentence.
    spaced_initial = re.search(r" [A-Z]\. ", full_desc)
    while spaced_initial is not None:
        fragment = spaced_initial.group(0)
        full_desc = full_desc.replace(fragment, fragment.rstrip())
        spaced_initial = re.search(r" [A-Z]\. ", full_desc)

    if re.search(r"[^\.] \(Shotgun\)", full_desc) is not None:
        full_desc = full_desc.replace(" (Shotgun)", ". (Shotgun)")
    full_desc = full_desc.replace('.(Shotgun)', '. (Shotgun)')

    full_desc = _end_sentence_after(r" a[st] QB for the \w+ ", full_desc)
    # NOTE(review): "\.w+?" below matches literal 'w' characters and looks
    # like a typo for "\.\w+?".  It is left untouched because the pattern on
    # the following line appears to cover the same input -- confirm.
    full_desc = _end_sentence_after(
        r"New QB.{0,20}[0-9]+ \w+?\.w+? ", full_desc)
    full_desc = _end_sentence_after(
        r"New QB.{0,20}[0-9]+ \w+?[\.\, ] ?\w+? ", full_desc)
    full_desc = _end_sentence_after(r"\#[0-9]+ Eligible ", full_desc)

    for old, new in _LITERAL_REWRITES:
        full_desc = full_desc.replace(old, new)

    full_desc = _end_sentence_after(
        r"New QB \#[0-9]+ \w+?\.\w+? ", full_desc)
    # The original tested a shorter pattern and then called .group() on this
    # longer one, which raised AttributeError when nothing followed the
    # jersey number.  Searching the long pattern directly is a no-op then.
    full_desc = _end_sentence_after(
        r"took the reverse handoff from #[0-9]+ \S+ ", full_desc)
    return full_desc


def _without_parentheses(text):
    """Drop every '(...)' group from *text* and strip the result."""
    return re.sub(r"\(.+?\)", "", text).strip()


def _select_play_sentence(full_desc):
    """Return the first sentence that describes a run or a pass, else ''."""
    sentences = full_desc.split('. ')
    last_index = len(sentences) - 1

    for i, sentence in enumerate(sentences):
        # Cut away eligibility / substitution announcements that precede the
        # actual play inside the same sentence.
        if 'as eligible (Shotgun) ' in sentence:
            sentence = re.sub(r"^.+ \(Shotgun\) ", "", sentence).strip()
        if re.search(r' eligible \S+\.\S+ ', sentence) is not None:
            sentence = re.sub(r"^.+ eligible ", "", sentence).strip()

        if ' as eligible' in sentence:
            continue
        if 'was injured during the play' in sentence:
            continue
        if 'lines up at ' in sentence:
            continue
        if re.search(r' at QB$', sentence) is not None:
            continue

        if ' in at QB' in sentence:
            sentence = re.sub(r"^.+ in at QB", "", sentence).strip()
        if ' report as eligible' in sentence:
            sentence = re.sub(r"^.+ report as eligible", "", sentence).strip()
        if ('at QB' in sentence) and ('at WR' in sentence):
            # QB and WR switched positions
            continue

        desc = _without_parentheses(sentence)
        desc = re.sub(r"\{.+?\}", "", desc).strip()

        # Re-join a sentence that the split cut in the middle.
        has_next = i < last_index
        ends_open = (re.search(r'to \w+$', desc) is not None
                     or re.search(r'^\w+$', desc) is not None)
        if ends_open and has_next and ('respotted to' not in desc):
            desc = desc + '.' + _without_parentheses(sentences[i + 1])
        if has_next and sentences[i + 1][:3] == 'to ':
            desc = desc + '.' + _without_parentheses(sentences[i + 1])

        if (' at QB' in desc) or (' eligible' in desc) \
                or ('Injury update: ' in desc):
            continue
        if desc.startswith(('Reverse', 'Direction change',
                            'Direction Change')):
            continue

        has_side = (' left' in desc) or (' right' in desc)
        if ' pass ' in desc:
            has_length = (' short ' in desc) or (' deep' in desc)
            has_result = ((' incomplete ' in desc) or (' for ' in desc)
                          or (' INTERCEPTED ' in desc))
            if has_length and (has_side or ' middle' in desc) and has_result:
                return desc
        elif (has_side or ' up the middle' in desc) and (' for ' in desc):
            return desc

    return ''


def _report_unparsed(desc, raw_desc):
    """Print the selected sentence and the raw description of a skipped play."""
    print(desc)
    print(raw_desc)


def _parse_play(play):
    """Return (features, success, yards, progress) for *play*, or None to skip it."""
    if not _is_usable_play(play):
        return None

    desc = _select_play_sentence(_normalize_description(play.desc))
    if desc == '':
        return None

    features = _situational_features(play)
    yards = 0
    intercepted = (play.note == 'INT') or ('INTERCEPTED' in desc)

    if 'incomplete' in desc:
        features['pass'] = 1
        target = re.search(r'incomplete \S+ \S+ to ', desc)
        if target is None:
            # ball just thrown away, no intended target -> ignore
            return None
        words = target.group(0).split()
        features['passlen'] = words[1]
        features['side'] = words[2]
    else:
        if ('no gain' not in desc) and not intercepted:
            gained = re.search(r'[-]?[0-9]+ yard\s?', desc)
            if gained is None:
                # The original printed these diagnostics and then crashed on
                # None.group(); report and skip the play instead.
                _report_unparsed(desc, play.desc)
                return None
            gained_text = gained.group(0)
            yards = float(gained_text[:gained_text.find(' ')])

        if ' pass ' in desc:
            features['pass'] = 1
            thrown = re.search(r'pass \S+ \S+', desc)
            if thrown is None:
                _report_unparsed(desc, play.desc)
                return None
            words = thrown.group(0).split()
            if words[1] == 'to':
                # no pass length / side given
                return None
            features['passlen'] = words[1]
            features['side'] = words[2]
        else:
            features['pass'] = 0
            if 'up the middle' in desc:
                features['side'] = 'middle'
            else:
                rush = re.search(r'^\S+ (scrambles )?\S+ \S+', desc)
                if rush is None:
                    _report_unparsed(desc, play.desc)
                    return None
                words = rush.group(0).split()
                offset = 0
                if words[1] == 'scrambles':
                    features['qbrun'] = 1
                    offset = 1
                if words[2 + offset] == "guard":
                    features['side'] = 'middle'
                else:
                    features['side'] = words[1 + offset]

    success = 0
    progress = 0
    if not intercepted:
        # "== True" is kept from the original comparison on purpose.
        scored = (play.touchdown == True) and (' fumble' not in play.desc)
        if scored or yards >= play.yards_togo:
            success = 1

        # progress label calculation
        if yards >= play.yards_togo:
            # new first down reached (the original wrote "progress == 1",
            # a comparison, so this label was never actually set)
            progress = 1
        elif (play.down in [1, 2]) and (yards > 0):
            progress = (float(yards) / float(play.yards_togo)) ** play.down
        else:
            # 3rd or 4th down attempt without conversion
            progress = 0

    if features['side'] not in ['middle', 'left', 'right']:
        print(play.desc)
        print('')
        return None

    return features, success, yards, progress


def extract_features(start_year, end_year):
    """Extract per-play features and labels from nflgame play-by-play data.

    Every play of weeks 1 through 17 of each season from *start_year* to
    *end_year* (both inclusive) is loaded through nflgame.  Plays that are
    filtered out, or whose description cannot be parsed as a run or a pass,
    are skipped.  The number of kept plays is printed before returning.

    Each kept play yields a ``defaultdict(float)`` with the keys 'team',
    'opponent', 'time', 'half', 'position', 'down', 'togo', 'pass' and
    'side', plus 'shotgun', 'passlen' and 'qbrun' where applicable.

    Returns a 4-tuple of numpy arrays with one entry per kept play:
      * the feature dicts,
      * success labels: 1 for a touchdown without a fumble or a gain of at
        least the distance to go, otherwise 0 (always 0 on interceptions),
      * yards gained,
      * progress labels: 1 when the distance to go was covered,
        ``(yards / togo) ** down`` for a positive gain on 1st or 2nd down,
        otherwise 0.

    An empty year range (start_year > end_year) returns four empty arrays.
    """
    play_features = []
    success_labels = []
    yard_labels = []
    progress_labels = []

    for year in range(start_year, end_year + 1):
        # split into individual weeks in order to avoid having to load
        # large chunks of data at once
        for week in range(1, 18):
            games = nflgame.games(year, week=week)
            for play in nflgame.combine_plays(games):
                parsed = _parse_play(play)
                if parsed is None:
                    continue
                features, success, yards, progress = parsed
                play_features.append(features)
                success_labels.append(success)
                yard_labels.append(yards)
                progress_labels.append(progress)

    print(len(play_features))
    return (np.array(play_features), np.array(success_labels),
            np.array(yard_labels), np.array(progress_labels))
# Encode categorical features
# Returns encoded features and the encoder
def encode_categorical_features(features, sparse=True):
    """Vectorise feature dicts with scikit-learn's DictVectorizer.

    features -- sequence of feature dicts; the vectorizer is fitted on it and
                then used to transform it.
    sparse   -- forwarded unchanged to ``DictVectorizer(sparse=...)``.

    Returns ``(encoded_features, vectorizer)`` so the caller can reuse the
    fitted vectorizer.
    """
    # Imported here, as in the original, so scikit-learn is only needed when
    # this function is actually called.
    from sklearn.feature_extraction import DictVectorizer

    vectorizer = DictVectorizer(sparse=sparse)
    encoded = vectorizer.fit(features).transform(features)
    return encoded, vectorizer
def get_team_features(team, features, labels, feature_name='team'):
    """Select the plays whose *feature_name* entry equals *team*.

    team         -- value to match.
    features     -- sequence of feature dicts.
    labels       -- sequence indexed in parallel with *features*.
    feature_name -- key to compare against *team* (default 'team').

    Each selected dict is copied and *feature_name* is removed from the copy;
    the dicts in *features* are not modified by the removal.  The number of
    selected plays is printed.

    Returns ``(np.array(selected_features), np.array(selected_labels))``.
    """
    team_features = []
    team_labels = []
    for index, play in enumerate(features):
        if play[feature_name] == team:
            # Work on a copy so the caller's dict keeps its key.
            remaining = play.copy()
            del remaining[feature_name]
            team_features.append(remaining)
            team_labels.append(labels[index])

    # Parenthesised form prints the same on Python 2 and parses on Python 3.
    print(len(team_features))
    return (np.array(team_features), np.array(team_labels))