def make_hist():
    """
    Draw stacked histograms of points per game (ppg).

    The upper panel shows the total ppg data and the lower panel the
    at-home ppg data. Each panel gets a vertical line and a text caption
    at its own mean. The panels share an x axis, y ticks are hidden, and
    the figure is displayed before returning.
    """
    home_ppgs = get_data('data/home_ppg.dat')
    home_mean = np.mean(home_ppgs)
    all_ppgs = get_data('data/total_ppg.dat')
    all_mean = np.mean(all_ppgs)

    _, (top, bottom) = plt.subplots(2, sharex=True)

    # One row per panel: axes, samples, mean, height of the mean line and
    # its colour, mean caption format and y position, side caption and its
    # y position.
    panels = (
        (top, all_ppgs, all_mean, 700, 'g',
         "total mean=%1.3f", 620, "all games", 520),
        (bottom, home_ppgs, home_mean, 350, 'b',
         "home mean=%1.3f", 300, "home games", 270),
    )
    for (axes, samples, mean, line_top, colour,
         mean_fmt, mean_y, side_caption, side_y) in panels:
        axes.hist(samples, bins=12, color='gray')
        axes.plot([mean, mean], [0, line_top], lw=3, color=colour)
        axes.text(mean+1, mean_y, mean_fmt % mean, size='large')
        axes.text(-3, side_y, side_caption, rotation=90, size='large')

    # Hide y ticks on both panels; plt.yticks acts on the current axes,
    # which is the last subplot created (the lower panel).
    top.set_yticks([])
    plt.yticks([])

    plt.xlabel('points per game', size='large')
    plt.show()
def plot_data(years, stat, win_type):
    """
    Plot every team's name at the point (wins, stat value).

    The rows are read from 'data/<years>_top_<stat>.dat'. Each row must
    unpack into (name, value, passes_filter, home_wins, total_wins).
    Names are drawn in black when passes_filter is truthy and in red
    otherwise.

    Parameters:
        years    -- identifier substituted into the data file name.
        stat     -- statistic name; used in the file name and as y label.
        win_type -- 'home' or 'total', selecting which win count goes on
                    the x axis. Any other value prints a warning once and
                    falls back to 'home' (for both the data and the label).

    The axis limits pad the data by a tenth of the largest value; the
    upper bounds never drop below 0. With no rows the limits are left to
    matplotlib's defaults.
    """
    rows = get_data('data/%s_top_%s.dat' % (years, stat))

    # Validate once, up front, so the warning is not repeated for every
    # row and the x label matches the data that is actually plotted.
    if win_type not in ('home', 'total'):
        print("Warning: invalid win_type. Defaulted to 'home'.")
        win_type = 'home'

    plt.figure(1)

    xs, ys = [], []
    for name, value, passes_filter, home_wins, total_wins in rows:
        wins = home_wins if win_type == 'home' else total_wins
        color = 'k' if passes_filter else 'r'
        plt.text(wins, value, name, size='large', color=color)
        xs.append(wins)
        ys.append(value)

    # Only set limits when there is something to bound; text artists do
    # not autoscale the axes, so explicit limits are needed for real data.
    if xs:
        min_x, max_x = min(xs), max(0, max(xs))
        min_y, max_y = min(ys), max(0, max(ys))
        plt.xlim(min_x - max_x/10.0, max_x + max_x/10.0)
        plt.ylim(min_y - max_y/10.0, max_y + max_y/10.0)

    # X and Y labels
    plt.ylabel(stat, size='large')
    plt.xlabel('# of %s wins' % win_type, size='large')
    plt.show()
def make_boxplot():
    """
    Draw horizontal box plots of the home, away, and total ppg data.

    The three data sets are plotted in that order (bottom to top), each
    with a rotated caption to the left of the plot, and the figure is
    displayed before returning.
    """
    home_ppg = get_data('data/home_ppg.dat')
    away_ppg = get_data('data/away_ppg.dat')
    total_ppg = get_data('data/total_ppg.dat')

    axes = plt.subplot(111)
    plt.boxplot([home_ppg, away_ppg, total_ppg], vert=False, widths=0.8)
    plt.yticks([])

    # Captions sit just above each box's y position (boxes are at 1, 2, 3).
    captions = (
        (3.3, "all games"),
        (2.3, "away games"),
        (1.3, "home games"),
    )
    for height, caption in captions:
        plt.text(-3, height, caption, rotation=90, size='large')

    plt.xlabel("points per game")
    plt.xlim(-1, 70)
    axes.xaxis.label.set_fontsize(20)
    plt.show()
def plot_data_comparison(years_x, stat_x, years_y, stat_y, square):
    """
    Plot team names at (stat_x value, stat_y value).

    The x values come from 'data/<years_x>_top_<stat_x>.dat' and the y
    values from 'data/<years_y>_top_<stat_y>.dat'. In both files the
    first two fields of every row are used as (team name, value).

    Parameters:
        years_x, stat_x -- select the data file for the x axis.
        years_y, stat_y -- select the data file for the y axis.
        square          -- when truthy, both axes use the same limits.

    Only teams that appear in both files are plotted; a warning is
    printed when any team is skipped. The limits pad the data range by a
    sixth on each side; the upper bounds never drop below 0. When no team
    can be plotted the limits are left to matplotlib's defaults.
    """
    data_x = get_data('data/%s_top_%s.dat' % (years_x, stat_x))
    data_y = get_data('data/%s_top_%s.dat' % (years_y, stat_y))
    plt.figure(1)

    # Links a team with their stat_x / stat_y. A single pass over each
    # file: looping over data_y twice (nested) appended every y value
    # len(data_y) times and broke the (x, y) unpacking below.
    x_by_team = dict((datum[0], datum[1]) for datum in data_x)
    y_by_team = dict((datum[0], datum[1]) for datum in data_y)

    # A point needs both coordinates, so keep only teams found in both.
    teams = [team for team in x_by_team if team in y_by_team]
    skipped = len(x_by_team) + len(y_by_team) - 2 * len(teams)
    if skipped:
        print("Warning: %d team(s) missing from one data set were skipped."
              % skipped)

    # Plot
    xs, ys = [], []
    for team in teams:
        x_val, y_val = x_by_team[team], y_by_team[team]
        plt.text(x_val, y_val, team, size='large')
        xs.append(x_val)
        ys.append(y_val)

    if xs:
        min_x, max_x = min(xs), max(0, max(xs))
        min_y, max_y = min(ys), max(0, max(ys))
        if square:
            min_x = min_y = min(min_x, min_y)
            max_x = max_y = max(max_x, max_y)
        x_diff = max_x - min_x
        y_diff = max_y - min_y
        plt.xlim(min_x - x_diff/6.0, max_x + x_diff/6.0)
        plt.ylim(min_y - y_diff/6.0, max_y + y_diff/6.0)

    # X and Y labels
    plt.ylabel(stat_y, size='large')
    plt.xlabel(stat_x, size='large')
    plt.show()
def plot_box_and_hist(filename, xlabel, comparison_val):
    """
    Show a notched box plot above a histogram of the data in filename.

    Both panels get two vertical marker lines: a green one at the mean of
    the data and a blue one at comparison_val. For example, when plotting
    at-home point spreads, 0.0 is a sensible comparison value.

    Parameters:
        filename       -- path handed to get_data.
        xlabel         -- label for the x axis of the histogram panel.
        comparison_val -- x position of the blue reference line.
    """
    samples = get_data(filename)
    mean = np.mean(samples)

    figure = plt.figure(figsize=(6, 4))
    box_ax = figure.add_axes([0.1, 0.7, 0.7, 0.15])
    hist_ax = figure.add_axes([0.1, 0.1, 0.7, 0.6])

    # Font sizes for the histogram's axis labels and tick labels.
    for axis in (hist_ax.xaxis, hist_ax.yaxis):
        axis.label.set_fontsize(13)
    for tick_labels in (hist_ax.get_xticklabels(),
                        hist_ax.get_yticklabels()):
        for tick_label in tick_labels:
            tick_label.set_fontsize(13)

    # (x position, colour) of the vertical marker lines, in drawing order.
    markers = ((mean, 'g'), (comparison_val, 'b'))

    box_ax.boxplot(samples, notch=True, vert=False)
    for position, colour in markers:
        box_ax.plot([position, position], [0, 3], lw=3, color=colour)

    counts = hist_ax.hist(samples, bins=12, color='gray')[0]
    tallest = max(counts)
    # Leave a sixth of the tallest bar as headroom above the histogram.
    ceiling = tallest+tallest/6
    for position, colour in markers:
        hist_ax.plot([position, position], [0, ceiling], lw=3, color=colour)
    hist_ax.set_ylim(0, ceiling)

    box_ax.set_xticklabels([])
    box_ax.set_yticks([])
    hist_ax.set_yticks([])
    plt.xlabel(xlabel, size='large')
    plt.show()
Both tests result with very large t-values, mainly because the sample size is so large (over 1500 games). I don't find this very interesting.
"""


def get_std(dat1, dat2):
    """
    Return the pooled variance of two samples.

    The squared deviations of each sample from its own mean are summed
    and divided by len(dat1) + len(dat2) - 2. Despite the name, no square
    root is taken here; the caller below applies sqrt itself.
    """
    mean1 = np.mean(dat1)
    mean2 = np.mean(dat2)
    summed = (sum((x - mean1)**2 for x in dat1) +
              sum((x - mean2)**2 for x in dat2))
    # 2.0 keeps the division in floating point under Python 2.
    return summed / (len(dat1) + len(dat2) - 2.0)


if __name__ == "__main__":
    home = get_data('data/home_ppg.dat')
    home_mean, home_n = np.mean(home), len(home)
    away = get_data('data/away_ppg.dat')
    away_mean, away_n = np.mean(away), len(away)
    # get_std returns a pooled variance, hence the sqrt in the next line.
    std = get_std(home, away)
    # Two-sample t statistic for the difference of the home and away means.
    print (home_mean - away_mean) / sqrt(std * (1.0/home_n + 1.0/away_n))
    # Per-index spread: home value minus away value.
    # NOTE(review): assumes home and away are aligned game-by-game and that
    # away is at least as long as home -- confirm against the data files.
    spread = []
    for i in xrange(len(home)):
        spread.append(home[i] - away[i])
    spread_mean = np.mean(spread)
    spread_std = np.std(spread)
    # Open the spread output file for writing (Python 2 `file` built-in).
    f = file('data/home_spread.dat', 'w')
    for val in spread: