/
CFB_model.py
376 lines (322 loc) · 19.5 KB
/
CFB_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
import pandas as pd
from sklearn.linear_model import LassoCV, RidgeCV
import numpy as np
from datetime import datetime
class CFB:
    """
    Simulate college football DI-A matchups and their point spreads.

    Lasso and ridge regressions are fitted on per-team offensive/defensive
    statistics and SRS ratings scraped from
    https://www.sports-reference.com/cfb/. The class takes two years, year1
    and year2, which bound (inclusively) the seasons used as training data.
    Methods that scrape a single season (CFB_stats, schedule, ratings, rank)
    use year1 as that season.
    """

    # Dictionary of current conferences and teams in conference for all D-1A, as of 2019 season. The teams
    # are listed in the format the model accepts.
    current_conf = {
        'acc': ['Virginia Tech', 'Boston College', 'Georgia Tech', 'Florida State',
                'Clemson', 'Pitt', 'Louisville', 'North Carolina',
                'Syracuse', 'Virginia', 'Duke', 'Miami (FL)',
                'North Carolina State', 'Wake Forest'],
        'american': ['UCF', 'Memphis', 'Tulsa', 'East Carolina', 'Navy',
                     'Cincinnati', 'SMU', 'Tulane', 'Connecticut',
                     'South Florida', 'Houston', 'Temple'],
        'big-12': ['Oklahoma', 'Kansas', 'Texas Christian', 'Texas Tech',
                   'Kansas State', 'Baylor', 'Oklahoma State', 'Texas', 'Iowa State',
                   'West Virginia'],
        'big-ten': ['Northwestern', 'Rutgers', 'Penn State', 'Minnesota', 'Nebraska',
                    'Michigan State', 'Wisconsin', 'Indiana', 'Michigan', 'Ohio State',
                    'Purdue', 'Illinois', 'Iowa', 'Maryland'],
        'cusa': ['Florida International', 'Old Dominion', 'Rice', 'Charlotte',
                 'Louisiana Tech', 'Marshall', 'Florida Atlantic', 'UTEP',
                 'Middle Tennessee State', 'UAB', 'North Texas',
                 'UTSA', 'Western Kentucky', 'Southern Mississippi'],
        'independent': ['Liberty', 'Army', 'Massachusetts', 'Notre Dame', 'Brigham Young',
                        'New Mexico State'],
        'mac': ['Eastern Michigan', 'Central Michigan', 'Miami (OH)', 'Kent State',
                'Northern Illinois', 'Western Michigan', 'Buffalo', 'Ohio',
                'Bowling Green State', 'Ball State', 'Akron', 'Toledo'],
        'mwc': ['Hawaii', 'Air Force', 'Boise State', 'Nevada', 'Colorado State',
                'Fresno State', 'Wyoming', 'New Mexico', 'San Diego State',
                'UNLV', 'San Jose State', 'Utah State'],
        'pac-12': ['USC', 'Washington', 'Washington State', 'Arizona',
                   'Stanford', 'Arizona State', 'UCLA', 'Oregon State', 'Oregon',
                   'Utah', 'California', 'Colorado'],
        'sec': ['Georgia', 'Kentucky', 'Alabama', 'LSU', 'Texas A&M',
                'Arkansas', 'Florida', 'Mississippi State', 'South Carolina',
                'Tennessee', 'Ole Miss', 'Vanderbilt', 'Auburn', 'Missouri'],
        'sun-belt': ['Texas State', 'Coastal Carolina', 'Troy', 'South Alabama',
                     'Arkansas State', 'Louisiana-Monroe', 'Georgia State', 'Louisiana',
                     'Appalachian State', 'Georgia Southern'],
    }

    # All conferences existing from 2000 to 2019, with abbreviations used to convert from rank
    # dataframe to URL input.
    hist_conf = {
        'Southeastern Conference': 'sec', 'Big East Conference': 'big-east',
        'Atlantic Coast Conference': 'acc', 'Big 12 Conference': 'big-12',
        'American Athletic Conference': 'american',
        'Pacific-10 Conference': 'pac-10', 'Pacific-12 Conference': 'pac-12',
        'Big Ten Conference': 'big-ten', 'Mountain West Conference': 'mwc',
        'Independent': 'independent', 'Western Athletic Conference': 'wac',
        'Conference USA': 'cusa', 'Mid-American Conference': 'mac',
        'Sun Belt Conference': 'sun-belt', 'Big West Conference': 'big-west',
    }

    # Used to map schedule team names to stat team names.
    team_mapping = {
        'Louisiana State': 'LSU', 'Southern Methodist': 'SMU', 'Southern California': 'USC',
        'Central Florida': 'UCF', 'Pittsburgh': 'Pitt', 'Mississippi': 'Ole Miss',
        'Alabama-Birmingham': 'UAB', 'Texas-San Antonio': 'UTSA', 'Texas-El Paso': 'UTEP',
        'Nevada-Las Vegas': 'UNLV',
    }

    def __init__(self, year1, year2):
        """Store the inclusive range of seasons (year1..year2) used as model input."""
        self.year1 = year1
        self.year2 = year2

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @classmethod
    def _all_teams(cls):
        """Return the set of every team name listed in current_conf."""
        return {team for teams in cls.current_conf.values() for team in teams}

    @classmethod
    def _conference_of(cls, team):
        """
        Return the key of the first conference in current_conf that lists team.

        Raises IndexError when the team is not listed in any conference.
        """
        matches = [conf for conf, teams in cls.current_conf.items() if team in teams]
        return matches[0]

    @staticmethod
    def _strip_rank(team):
        """
        Strip a leading '(rank)' prefix from a schedule team name.

        The two Miami schools carry a parenthesised suffix of their own, so
        they are matched first and returned in canonical form. Non-string
        cells (e.g. NaN) are passed through unchanged so blank rows can be
        dropped later instead of raising here.
        """
        if not isinstance(team, str):
            return team
        if '(OH)' in team:
            return 'Miami (OH)'
        if '(FL)' in team:
            return 'Miami (FL)'
        if '(' in team:
            # Keep whatever follows the closing parenthesis of the rank and
            # remove the separator, whether it is a non-breaking or plain space.
            return team.split(')')[-1].replace(u'\xa0', u'').strip()
        return team

    @staticmethod
    def _clean_conf_table(table, prefix):
        """
        Flatten and relabel one conference team-offense/defense table.

        Args:
            table: DataFrame with two-level column labels whose first three
                columns are the school, games and points columns.
            prefix: 'Off' or 'Def'; prepended to every stat column name.

        Returns:
            A new DataFrame with a 'School' column followed by the prefixed
            stat columns; the 'G' and 'Pts' columns are removed.
        """
        table = table.copy()
        # Join the two header levels into a single label per column.
        table.columns = [' '.join(col) for col in table.columns.values]
        # The first three columns have a placeholder top level ending in '0'
        # (e.g. 'Unnamed: 1_level_0 School'); keep only the real name.
        leading = {col: col.split('0')[-1].strip() for col in table.columns[:3]}
        table = table.rename(columns=leading)
        table = table.drop(columns=["G", 'Pts'])
        stat_names = {col: '{} {}'.format(prefix, col) for col in table.columns[1:]}
        return table.rename(columns=stat_names)

    def _conf_table(self, conf, side, prefix):
        """
        Download and clean the team-<side> table of conf for season year1.

        Args:
            conf: conference abbreviation as used in the site URL.
            side: 'offense' or 'defense'.
            prefix: 'Off' or 'Def', forwarded to _clean_conf_table.
        """
        # The URL uses 'pac-10' instead of 'pac-12' for seasons before 2011.
        if conf == 'pac-12' and self.year1 < 2011:
            conf = 'pac-10'
        url = 'https://www.sports-reference.com/cfb/conferences/{}/{}-team-{}.html'.format(
            conf, self.year1, side)
        # Drop the first (rank) column of the scraped table.
        raw = pd.read_html(url)[0].iloc[:, 1:]
        return self._clean_conf_table(raw, prefix)

    @staticmethod
    def _orient_games(table):
        """
        Convert a winner/loser schedule table into an away/home table.

        Args:
            table: DataFrame with exactly seven columns, in order: week, date,
                winner, winner points, location marker ('@' when the winner
                was the away team), loser, loser points.

        Returns:
            DataFrame with columns 'Week', 'Away', 'Home', 'Home Spread',
            where 'Home Spread' is away points minus home points (positive
            when the away team won). Empty strings are replaced with NaN.
        """
        games = table.copy()
        games.columns = ['Week', 'Date', 'Winner', 'Winner Pts', 'Away/Home', 'Loser', 'Loser Pts']
        # Repeated header rows inside the table carry the literal text 'Winner'.
        games = games[games['Winner'] != 'Winner'].copy()
        games['Winner'] = games['Winner'].apply(CFB._strip_rank)
        games['Loser'] = games['Loser'].apply(CFB._strip_rank)
        margin = games['Winner Pts'].astype(float) - games['Loser Pts'].astype(float)
        # '@' means the winner played away; anything else (including NaN)
        # means the winner was the home side, so teams and sign are swapped.
        winner_away = games['Away/Home'] == '@'
        games['Away'] = games['Winner'].where(winner_away, games['Loser'])
        games['Home'] = games['Loser'].where(winner_away, games['Winner'])
        games['Home Spread'] = margin.where(winner_away, -margin)
        games = games[['Week', 'Away', 'Home', 'Home Spread']]
        return games.replace('', np.nan)

    # ------------------------------------------------------------------
    # Scrapers (all use year1 as the season)
    # ------------------------------------------------------------------
    def CFB_stats(self, conf):
        """
        Return per-team offensive and defensive stats of one conference.

        Downloads the team-offense and team-defense tables of conf for season
        year1, prefixes the stat columns with 'Off' / 'Def', merges them on
        'School' and fills missing values with 0. The site's values are used
        as-is (the original author notes they are already per-game averages).
        """
        offense = self._conf_table(conf, 'offense', 'Off')
        defense = self._conf_table(conf, 'defense', 'Def')
        all_stats = pd.merge(left=offense, right=defense, on="School")
        return all_stats.fillna(0)

    def schedule(self):
        """
        Return the DI-A schedule of season year1 in away/home form.

        Returns:
            DataFrame with columns 'Week', 'Away', 'Home', 'Home Spread',
            'Year'. See _orient_games for the meaning of 'Home Spread'.
        """
        url = 'https://www.sports-reference.com/cfb/years/{}-schedule.html'.format(self.year1)
        table = pd.read_html(url)[0]
        # The unnamed location column ('@' or NaN) sits at position 7 for
        # seasons after 2012 and at position 6 otherwise.
        location_col = 'Unnamed: 7' if self.year1 > 2012 else 'Unnamed: 6'
        if location_col not in table.columns:
            # Fall back to the column immediately before 'Loser' if the page
            # layout no longer matches the expected position.
            location_col = table.columns[table.columns.get_loc('Loser') - 1]
        table = table[['Wk', 'Date', 'Winner', 'Pts', location_col, 'Loser', 'Pts.1']]
        games = self._orient_games(table)
        games['Year'] = self.year1
        return games

    def ratings(self):
        """
        Return the SRS rating of every team in season year1.

        Returns:
            DataFrame indexed by school name with a single float 'SRS' column.
        """
        url = 'https://www.sports-reference.com/cfb/years/{}-ratings.html'.format(self.year1)
        table = pd.read_html(url)[0]
        # Keep only the second header level, then filter out the repeated
        # heading rows embedded in the table body.
        table.columns = table.columns.get_level_values(1)
        table = table[table['SRS'] != 'SRS']
        table = table[table['W'] != 'Overall']
        table = table[['School', 'SRS']].copy()
        table['School'] = table['School'].apply(lambda x: x.strip())
        table['SRS'] = table['SRS'].apply(float)
        table.set_index('School', inplace=True)
        # UNLV not in same format. Correct for format.
        table.rename(index={'Nevada-Las Vegas': 'UNLV'}, inplace=True)
        return table

    def rank(self):
        """
        Return the conference ranking table of season year1.

        Returns:
            DataFrame with columns 'Rk' and 'Conference', where 'Conference'
            holds the URL abbreviation looked up in hist_conf. Raises KeyError
            for a conference name that hist_conf does not list.
        """
        url = 'https://www.sports-reference.com/cfb/years/{}.html'.format(self.year1)
        table = pd.read_html(url)[0]
        table.columns = table.columns.get_level_values(1)
        table = table[['Rk', 'Conference']].copy()
        table['Conference'] = table['Conference'].apply(lambda x: CFB.hist_conf[x])
        return table

    # ------------------------------------------------------------------
    # Model input construction
    # ------------------------------------------------------------------
    def data_input(self):
        """
        Build the training table for seasons year1..year2 (inclusive).

        For each season the schedule is joined with the stats and SRS rating
        of the away team and of the home team. Season 2017 is skipped.

        Returns:
            DataFrame whose first column is 'Home Spread', followed by the
            'Away ...' and 'Home ...' stat columns. Team names and the year
            are dropped to anonymize the rows.
        """
        yearly = []
        for year in range(self.year1, self.year2 + 1):
            # Skip year 2017, as some issues exist with ratings. Discovered issue on Nov 6, 2019.
            if year == 2017:
                print("Skipping year 2017, as ratings are unavailable.")
                continue
            season = CFB(year, self.year2)
            schedule = season.schedule().drop(columns='Week').dropna(axis=0)
            # Translate schedule names to stat names (as get_week does) so
            # games of the mapped teams are not lost in the inner merges below.
            for side in ('Away', 'Home'):
                schedule[side] = schedule[side].replace(CFB.team_mapping)
            ratings = season.ratings()
            rank = season.rank()
            # Gather stats of every conference and attach each school's SRS
            # rating, used to differentiate strength of schedule and opponent.
            conf_frames = []
            for conf in rank['Conference']:
                stats = season.CFB_stats(conf)
                stats['Rating'] = stats['School'].apply(lambda x: ratings.loc[x, 'SRS'])
                conf_frames.append(stats)
            all_conf_stats = pd.concat(conf_frames) if conf_frames else pd.DataFrame()
            # First merge attaches away-team stats, second attaches home-team
            # stats; pandas suffixes the clashing stat columns with _x / _y.
            away_stats = pd.merge(schedule, all_conf_stats, left_on='Away', right_on='School')
            total_stats = pd.merge(away_stats, all_conf_stats, left_on='Home', right_on='School')
            yearly.append(total_stats)
            print('Data from {} compiled'.format(year))
        all_games = pd.concat(yearly) if yearly else pd.DataFrame()
        # Turn the merge suffixes into 'Away ' / 'Home ' prefixes.
        renamed = {}
        for col in all_games.columns:
            if col.endswith('_x'):
                renamed[col] = 'Away {}'.format(col[:-2])
            elif col.endswith('_y'):
                renamed[col] = 'Home {}'.format(col[:-2])
        all_games = all_games.rename(columns=renamed)
        return all_games.drop(columns=['Away', 'Home', 'Away School', 'Home School', 'Year'])

    def pred_input(self, pred_year, games):
        """
        Build the feature rows used to predict the given games.

        Args:
            pred_year: season whose stats and ratings describe the teams.
            games: list of [away_team, home_team] pairs, using the team names
                listed in current_conf.

        Returns:
            DataFrame with one row per game holding the 'Away ...' columns
            followed by the 'Home ...' columns (school names removed).
        """
        season = CFB(pred_year, self.year2)
        ratings = season.ratings()
        conf_cache = {}  # conference -> stats table, so each is downloaded once per call

        def team_row(team, conf, side):
            """Return the one-row stats frame of team, columns prefixed with side."""
            if conf not in conf_cache:
                conf_cache[conf] = season.CFB_stats(conf)
            conf_stats = conf_cache[conf]
            row = conf_stats[conf_stats['School'] == team].copy().reset_index(drop=True)
            row['Rating'] = row['School'].apply(lambda x: ratings.loc[x, 'SRS'])
            return row.rename(columns={col: '{} {}'.format(side, col) for col in row.columns})

        rows = []
        for game_num, game in enumerate(games, start=1):
            away_conf = self._conference_of(game[0])
            home_conf = self._conference_of(game[1])
            away_stats = team_row(game[0], away_conf, 'Away')
            home_stats = team_row(game[1], home_conf, 'Home')
            # Place away and home stats side by side and drop school names,
            # so that the row matches the model input layout.
            total = pd.concat([away_stats, home_stats], axis=1)
            total = total.drop(columns=['Away School', 'Home School'])
            rows.append(total)
            print('Game {} compiled'.format(game_num))
        return pd.concat(rows) if rows else pd.DataFrame()

    # ------------------------------------------------------------------
    # Modelling
    # ------------------------------------------------------------------
    def create_model(self, pred_year, games):
        """
        Fit lasso and ridge models and predict the spread of the given games.

        Args:
            pred_year: season of the games to predict; must be later than
                year2 and not later than the current year.
            games: list of [away_team, home_team] pairs, using the team names
                listed in current_conf.

        Returns:
            On success, a DataFrame indexed 'Game 1', 'Game 2', ... with the
            teams, each model's predicted winner and spread; the same table
            is also written to 'CFB_games_results.csv'. When an argument is
            invalid, a string describing the problem is returned instead and
            nothing is downloaded.
        """
        # Check for proper ranges of years given.
        if self.year1 < 2000:
            return 'Data not available before 2000.'
        elif self.year2 <= self.year1:
            return 'Year 2 must be greater than year 1.'
        elif self.year2 >= pred_year:
            return 'Year 2 must be less than prediction year.'
        elif pred_year > datetime.now().year:
            return 'Prediction year must be less than or equal to current year.'
        # Determine if all games are in DI-A, and are in proper format.
        # Refer to current_conf for teams available.
        known_teams = self._all_teams()
        for game in games:
            for team in game:
                if team not in known_teams:
                    return '{} either not D1-A team or not in proper format.'.format(team)
        # Nothing to predict: stop before the expensive data gathering.
        if not games:
            return 'No games to predict.'
        # Generate input values for model. X is everything except the spread, y is the spread.
        input_values = self.data_input()
        X = input_values.drop(columns='Home Spread')
        y = input_values['Home Spread']
        # Generate models, with 5 folds, set max_iter to 10000 for lasso, and fit to data.
        lasso_mod = LassoCV(cv=5, max_iter=10000).fit(X, y)
        ridge_mod = RidgeCV(cv=5).fit(X, y)
        # Build prediction rows and align their columns with the training columns.
        pred_values = self.pred_input(pred_year, games)
        pred_values = pred_values[X.columns]
        lasso_pred = lasso_mod.predict(pred_values)
        ridge_pred = ridge_mod.predict(pred_values)
        # A positive predicted 'Home Spread' means the away team (index 0) wins.
        results = {
            'Away': [x[0] for x in games],
            'Home': [x[1] for x in games],
            'Lasso Predicted Winner': [game[0] if pred > 0 else game[1]
                                       for game, pred in zip(games, lasso_pred)],
            'Ridge Predicted Winner': [game[0] if pred > 0 else game[1]
                                       for game, pred in zip(games, ridge_pred)],
            'Lasso Spread': [-abs(round(x, 1)) for x in lasso_pred],
            'Ridge Spread': [-abs(round(x, 1)) for x in ridge_pred],
        }
        # Create dataframe based on dictionary, create index, and save as csv.
        results = pd.DataFrame(results)
        results.index = pd.Index(['Game {}'.format(num) for num in range(1, len(games) + 1)])
        results.to_csv('CFB_games_results.csv')
        return results

    def get_week(self, pred_year, week):
        """
        Predict every DI-A vs DI-A game of one week of season pred_year.

        Args:
            pred_year: season whose schedule is read.
            week: week number; it is incremented by one before being matched
                against the schedule's 'Week' column.

        Returns:
            The result of create_model for that week's games, or a string
            when the (incremented) week is outside the schedule's range.
        """
        # Adjust weeks to match CFB reference and create schedule.
        week += 1
        schedule = CFB(pred_year, self.year2).schedule()
        # Check if week in possible range. Also retrieve only games of given week.
        # NOTE(review): the message below does not account for the +1 offset
        # applied above -- confirm the intended user-facing numbering.
        last_week = schedule['Week'].iloc[-1]
        if week not in range(1, int(last_week) + 1):
            return "Week not in possible range. Must be between 1 and {}.".format(last_week)
        week_sched = schedule[schedule['Week'] == str(week)]
        # Map schedule names to model names and keep only games in which both
        # teams are listed in current_conf.
        known_teams = self._all_teams()
        games = []
        for away, home in zip(week_sched['Away'], week_sched['Home']):
            game = [CFB.team_mapping.get(away, away), CFB.team_mapping.get(home, home)]
            if all(team in known_teams for team in game):
                games.append(game)
        # Run model to obtain results.
        return self.create_model(pred_year, games)