# sabermetrics-intro.py
"""
Source code for my blog post: Baseball Analytics - An Introduction to Sabermetrics using Python
__link__: http://adilmoujahid.com/posts/2014/07/baseball-analytics/
__author__ : Adil Moujahid
__website__: http://adilmoujahid.com
"""
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from matplotlib.ticker import FuncFormatter
def millions(x, pos):
    """Format a raw tick value as millions of dollars, e.g. ``'$1.5M'``.

    The signature matches the callback that ``FuncFormatter`` expects.

    Args:
        x: Tick value in the axis' native units (treated as dollars).
        pos: Tick position; unused, but part of the callback signature.

    Returns:
        The value scaled by 1e-6, rendered with one decimal place, prefixed
        with ``$`` and suffixed with ``M``.
    """
    return '$%1.1fM' % (x * 1e-6)
# Module-level tick formatter that renders axis values through millions().
formatter = FuncFormatter(millions)
def plot_spending_wins(teams, year):
    """Scatter-plot each team's total salary against its wins for one season.

    Oakland ('OAK'), the New York Yankees ('NYA') and Boston ('BOS') are drawn
    in their own colour and labelled with an annotated arrow; every other team
    is drawn as an unlabelled grey marker.

    Args:
        teams: DataFrame that provides 'salary' and 'W' columns and can be
            cross-sectioned by ``year`` (the caller in this file indexes it by
            ``['yearID', 'teamID']``).
        year: Season to plot; passed to ``teams.xs`` and shown in the title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # team ID -> (marker/label colour, label offset in points from the marker)
    highlighted = {
        'OAK': ("#4DDB94", (-30, 30)),
        'NYA': ("#0099FF", (30, -30)),
        'BOS': ("#FF6666", (-30, 30)),
    }

    teams_year = teams.xs(year)
    fig, ax = plt.subplots()

    # Walk the index and both columns in lockstep instead of doing a chained
    # label lookup (teams_year[col][i]) for every value.
    for team_id, salary, wins in zip(teams_year.index,
                                     teams_year['salary'],
                                     teams_year['W']):
        style = highlighted.get(team_id)
        if style is None:
            ax.scatter(salary, wins, color="grey", s=200)
            continue

        color, label_offset = style
        ax.scatter(salary, wins, color=color, s=200)
        ax.annotate(team_id, (salary, wins),
                    bbox=dict(boxstyle="round", color=color),
                    xytext=label_offset, textcoords='offset points',
                    arrowprops=dict(
                        arrowstyle="->",
                        connectionstyle="angle,angleA=0,angleB=90,rad=10"))

    # The x axis holds salaries, so label its ticks in millions of dollars.
    ax.xaxis.set_major_formatter(formatter)
    ax.tick_params(axis='x', labelsize=15)
    ax.tick_params(axis='y', labelsize=15)
    ax.set_xlabel('Salaries', fontsize=20)
    ax.set_ylabel('Number of Wins', fontsize=20)
    ax.set_title('Salaries - Wins: ' + str(year), fontsize=25, fontweight='bold')

    plt.show()
def main():
    """Load team and salary data, plot the 2001 season, and fit run models.

    Steps:
        1. Read ``../data/Teams.csv``, keep seasons from 1985 onwards and only
           the columns used below, and index by ``(yearID, teamID)``.
        2. Read ``../data/Salaries.csv``, sum salaries per ``(yearID, teamID)``
           and join the totals onto the team table.
        3. Plot salary against wins for 2001.
        4. Derive batting average (BA), on-base percentage (OBP) and slugging
           percentage (SLG), then fit three OLS models of runs scored (R) on
           those statistics and print each fit summary.
    """
    teams = pd.read_csv('../data/Teams.csv')
    teams = teams[teams['yearID'] >= 1985]
    teams = teams[['yearID', 'teamID', 'Rank', 'R', 'RA', 'G', 'W', 'H', 'BB',
                   'HBP', 'AB', 'SF', 'HR', '2B', '3B']]
    teams = teams.set_index(['yearID', 'teamID'])

    salaries = pd.read_csv('../data/Salaries.csv')
    salaries_by_yearID_teamID = salaries.groupby(['yearID', 'teamID'])['salary'].sum()

    # The summed Series shares the (yearID, teamID) index, so join() aligns it.
    teams = teams.join(salaries_by_yearID_teamID)

    plot_spending_wins(teams, 2001)

    teams['BA'] = teams['H'] / teams['AB']
    teams['OBP'] = ((teams['H'] + teams['BB'] + teams['HBP'])
                    / (teams['AB'] + teams['BB'] + teams['HBP'] + teams['SF']))
    # H counts every hit once; doubles, triples and home runs add their extra bases.
    teams['SLG'] = ((teams['H'] + teams['2B'] + (2 * teams['3B']) + (3 * teams['HR']))
                    / teams['AB'])

    # First, second and third model, in that order.
    formulas = ("R~OBP+SLG+BA", "R~OBP+SLG", "R~BA")

    # Fit every model before printing anything, so a failing fit produces no
    # partial output.
    fitted = [sm.ols(formula, teams).fit() for formula in formulas]

    for result in fitted:
        # A single-argument print(...) call behaves the same on Python 2 and 3.
        print(result.summary())
# Run the analysis only when executed as a script, not when imported.
if __name__ == '__main__':
    main()