def team_array():
    '''Build one merged dataframe per team.

    Loads the dataset, splits it with get_teams(), and for every name
    returned by get_team_keys() outer-merges that team's ' home' frame
    with its ' away' frame.

    Returns a dictionary mapping team name -> merged pd dataframe.

    Ex usage:
        df = team_array()
        print(df['Dallas Cowboys'])
    '''
    dataset = load.getDataset()
    split_by_team = get_teams(dataset)
    return {
        name: pd.merge(split_by_team[name + ' home'],
                       split_by_team[name + ' away'],
                       how='outer')
        for name in get_team_keys()
    }
def write_teams():
    '''Write one CSV per team containing its home and away games.

    The dataset is split with separate_teams(); each team's ' home' and
    ' away' tables are passed through format_home() / format_away(),
    tagged with a 'home_field?' column (the strings 'True' / 'False'),
    outer-merged, and written to <dirpath>/<team>.csv.

    Returns nothing; the only effect is the files written.
    '''
    dataset = load.getDataset()
    split_tables = separate_teams(dataset)

    # Every key ends in ' home' or ' away' (see the lookups below); dropping
    # the last five characters leaves the bare team name.
    team_names = {key[:-5] for key in split_tables.keys()}

    row_count = len(dataset)
    home_flag = pd.Series(np.repeat(np.array(['True']), row_count))
    away_flag = pd.Series(np.repeat(np.array(['False']), row_count))

    # NOTE(review): drops the last 10 characters of the working directory,
    # presumably a fixed-length sub-directory name -- confirm before running
    # this from anywhere else.
    out_dir = os.getcwd()[:-10] + '/data/teamdata/'

    for name in team_names:
        home_games = split_tables[name + ' home']
        away_games = split_tables[name + ' away']

        home_games = format_home(home_games)
        home_games['home_field?'] = home_flag

        away_games = format_away(away_games)
        away_games['home_field?'] = away_flag

        merged = pd.merge(home_games, away_games, how='outer')
        merged.to_csv(out_dir + name + '.csv')
def _clock_to_seconds(clock):
    '''Convert a 'MM:SS' time-of-possession string to whole seconds.'''
    parts = clock.split(':')
    return int(parts[0]) * 60 + int(parts[1])


def validate(table=None):
    '''Basic sanity checks on the numerical parts of the data.

    table -- optional dataframe to check; when omitted the dataset is
             loaded with load.getDataset().

    For both the home ('ht_') and away ('at_') column prefixes this checks,
    for every row:
      * inequalities: attempts >= successes (rush, pass, fumbles, 3rd and
        4th downs);
      * equalities: net_pass_yards + rush_yards == total_yards and
        fumbles_lost + INT == turnovers.
    It then checks that home + away time of possession is at least 3595
    seconds in every row.

    Raises AssertionError naming the first check that fails; returns None
    otherwise.

    Earlier versions only asserted that all rows gave the *same* answer
    (len(set(test)) == 1), so a table in which every row violated a check,
    or every sum was off by the same amount, passed. Each check now has to
    actually hold. An empty table passes vacuously.
    '''
    if table is None:
        table = load.getDataset()

    # (larger, smaller): the first column must be >= the second.
    at_least = (('rush_attempts', 'rush_TDs'),
                ('pass_attempt', 'pass_comp'),
                ('pass_comp', 'pass_TDs'),
                ('fumbles', 'fumbles_lost'),
                ('3rd_down_attempts', '3rd_down_converted'),
                ('4th_down_attempts', '4th_down_converted'))
    # (first, second, total): first + second must equal total.
    sums_to = (('net_pass_yards', 'rush_yards', 'total_yards'),
               ('fumbles_lost', 'INT', 'turnovers'))

    for side in ('ht_', 'at_'):
        for larger, smaller in at_least:
            holds = table[side + larger] >= table[side + smaller]
            if not holds.all():
                raise AssertionError(
                    '%s%s is smaller than %s%s in at least one row'
                    % (side, larger, side, smaller))

        for first, second, total in sums_to:
            residual = (table[side + first] + table[side + second]
                        - table[side + total])
            if not (residual == 0).all():
                raise AssertionError(
                    '%s%s + %s%s differs from %s%s in at least one row'
                    % (side, first, side, second, side, total))

    # 3595 is five seconds short of 60 minutes -- presumably slack for
    # rounding in the recorded clocks; confirm against the data source.
    for home_clock, away_clock in zip(table['ht_TOP'], table['at_TOP']):
        combined = (_clock_to_seconds(home_clock)
                    + _clock_to_seconds(away_clock))
        if combined < 3595:
            raise AssertionError(
                'combined time of possession %d s is below 3595 s'
                % combined)
def home_away_differences():
    '''Find the differences between teams and their opponents by location.

    For every team, every paired stat and both locations, subtracts the
    opponent's column from the team's own column over that team's games at
    that location.

    Returns a dictionary of numpy arrays keyed
    '<team> home <stat>' / '<team> away <stat>'.

    Ex. use:
        a = home_away_differences()
        plt.hist(a['Dallas Cowboys home score'], 20)
    which would plot a hist of how many more points the Cowboys have vs
    their opponent when the Cowboys are playing at home.
    a['Dallas Cowboys away score'] would be their stats when they are away.

    Might delete. Too specific of a function.
    '''
    table = load.getDataset()
    all_teams = get_teams(table)
    team_keys = get_team_keys()

    # A stat is any column that exists with both the 'at_' and the 'ht_'
    # prefix. Matching on the prefix (rather than on the substring 'at')
    # keeps unrelated columns such as 'attendance' out without having to
    # strip them by hand afterwards. TOP is skipped because it holds
    # 'MM:SS' strings (see validate()), which cannot be subtracted.
    column_names = set(table.keys())
    stats = [name[3:] for name in table.keys()
             if name.startswith('at_')
             and name != 'at_TOP'
             and 'ht_' + name[3:] in column_names]

    # (key suffix, the team's own prefix, the opponent's prefix)
    locations = ((' home', 'ht_', 'at_'),
                 (' away', 'at_', 'ht_'))

    stat_diff = {}
    for team in team_keys:
        for stat in stats:
            for suffix, own, opponent in locations:
                games = all_teams[team + suffix]
                stat_diff[team + suffix + ' ' + stat] = (
                    np.array(games[own + stat])
                    - np.array(games[opponent + stat]))
    return stat_diff
def cum_array(stat):
    '''Fetch a single stat from the full table.

    stat -- column name, as a string.

    Returns the result of indexing the loaded dataset with that name
    (a pd series).
    '''
    dataset = load.getDataset()
    return dataset[stat]
def cum_stat(stat):
    '''Give a rough idea of the distribution of a stat for all games at a
    glance.

    stat -- column name, as a string.

    Returns that stat's column of the table's describe() summary.
    '''
    summary = load.getDataset().describe()
    return summary[stat]
def get_team_keys(table=None):
    '''Get a list of all teams.

    table -- optional dataframe with 'home_team' and 'away_team' columns;
             when omitted the dataset is loaded with load.getDataset().

    Returns the distinct team names found in either column, with
    surrounding whitespace removed, in sorted order.

    Names are stripped *before* duplicates are dropped, so the same team
    spelled with and without stray whitespace is reported once.
    '''
    if table is None:
        table = load.getDataset()

    names = set()
    for column in ('home_team', 'away_team'):
        names.update(name.strip() for name in table[column].values)
    return sorted(names)