def make_hist():
    """
    Draws two stacked histograms that share an x-axis: all games in the top
    panel and at-home games in the bottom panel.

    Each panel gets a vertical line at the mean of its own data (green for
    the total ppg mean, blue for the at-home ppg mean) and a text label
    showing that mean.
    """
    home_ppgs = get_data('data/home_ppg.dat')
    home_avg = np.mean(home_ppgs)

    all_ppgs = get_data('data/total_ppg.dat')
    all_avg = np.mean(all_ppgs)

    fig, (top_ax, bottom_ax) = plt.subplots(2, sharex=True)

    # Top panel: every game. The y-coordinates of the mean line and the
    # labels are fixed numbers (presumably tuned to this data set).
    top_ax.hist(all_ppgs, bins=12, color='gray')
    top_ax.plot([all_avg] * 2, [0, 700], lw=3, color='g')
    total_label = "total mean=%1.3f" % all_avg
    top_ax.text(all_avg+1, 620, total_label, size='large')
    top_ax.text(-3, 520, "all games", rotation=90, size='large')
    top_ax.set_yticks([])

    # Bottom panel: home games only.
    bottom_ax.hist(home_ppgs, bins=12, color='gray')
    bottom_ax.plot([home_avg] * 2, [0, 350], lw=3, color='b')
    home_label = "home mean=%1.3f" % home_avg
    bottom_ax.text(home_avg+1, 300, home_label, size='large')
    bottom_ax.text(-3, 270, "home games", rotation=90, size='large')

    # These pyplot calls act on pyplot's current axes.
    plt.yticks([])
    plt.xlabel('points per game', size='large')

    plt.show()
def plot_data(years, stat, win_type):
    """
    Plots each team's name at the point (wins, stat value).

    The records are read from 'data/<years>_top_<stat>.dat'. Each record is
    unpacked as (name, value, passes_filter, home_wins, total_wins). Names
    of records whose passes_filter is true are drawn in black, all others
    in red.

    win_type selects what goes on the x-axis: 'home' for home wins or
    'total' for total wins. Any other value prints a warning once and is
    treated as 'home', both for the plotted values and the x-axis label.

    If there are no records, the axis limits are left at their defaults.
    """
    data = get_data('data/%s_top_%s.dat' % (years, stat))
    plt.figure(1)

    # win_type is the same for every record, so validate it once up front
    # instead of repeating the warning for each row.
    if win_type not in ('home', 'total'):
        print("Warning: invalid win_type. Defaulted to 'home'.")
        win_type = 'home'

    # Upper bounds start at 0 so the max/10 margins used below are never
    # negative.
    min_x, max_x = float('inf'), 0
    min_y, max_y = float('inf'), 0
    plotted = False

    for name, value, passes_filter, home_wins, total_wins in data:
        wins = home_wins if win_type == 'home' else total_wins
        color = 'k' if passes_filter else 'r'
        plt.text(wins, value, name, size='large', color=color)

        min_x, max_x = min(min_x, wins), max(max_x, wins)
        min_y, max_y = min(min_y, value), max(max_y, value)
        plotted = True

    # Without any points the running minimums are still infinite, which
    # would make the limits meaningless.
    if plotted:
        plt.xlim(min_x - max_x/10.0, max_x + max_x/10.0)
        plt.ylim(min_y - max_y/10.0, max_y + max_y/10.0)

    # X and Y labels
    plt.ylabel(stat, size='large')
    plt.xlabel('# of %s wins' % win_type, size='large')

    plt.show()
def make_boxplot():
    """
    Draws horizontal boxplots of the home, away, and total ppg data in a
    single axes, with a rotated text label next to each box.
    """
    # Order matters: boxes are placed at y = 1, 2, 3 in list order.
    samples = [
        get_data('data/home_ppg.dat'),
        get_data('data/away_ppg.dat'),
        get_data('data/total_ppg.dat'),
    ]

    axes = plt.subplot(111)
    plt.boxplot(samples, vert=False, widths=0.8)
    plt.yticks([])

    # (y position, caption) for the label beside each box.
    captions = (
        (3.3, "all games"),
        (2.3, "away games"),
        (1.3, "home games"),
    )
    for y_pos, caption in captions:
        plt.text(-3, y_pos, caption, rotation=90, size='large')

    plt.xlabel("points per game")
    plt.xlim(-1, 70)
    axes.xaxis.label.set_fontsize(20)
    plt.show()
def plot_data_comparison(years_x, stat_x, years_y, stat_y, square):
    """
    Plots each team's name at the point (stat_x value, stat_y value).

    The x values are read from 'data/<years_x>_top_<stat_x>.dat' and the
    y values from 'data/<years_y>_top_<stat_y>.dat'. In both files the first
    field of a record is used as the team name and the second field as the
    stat value.

    Only teams that appear in both files are plotted, since a point needs
    both coordinates. If a team appears more than once in a file, its last
    value is used.

    If square is true, both axes get the same limits (the overall minimum
    and maximum across both stats). If no team can be plotted, the axis
    limits are left at their defaults.
    """
    data_x = get_data('data/%s_top_%s.dat' % (years_x, stat_x))
    data_y = get_data('data/%s_top_%s.dat' % (years_y, stat_y))
    plt.figure(1)

    # Links a team with their stat_x.
    x_by_team = {}
    for datum in data_x:
        x_by_team[datum[0]] = datum[1]

    # Links a team with their (stat_x, stat_y) pair. Teams missing from
    # data_x are skipped rather than raising a KeyError.
    points = {}
    for datum in data_y:
        name, value = datum[0], datum[1]
        if name in x_by_team:
            points[name] = (x_by_team[name], value)

    # Plot
    for team, (x_val, y_val) in points.items():
        plt.text(x_val, y_val, team, size='large')

    if points:
        # Take the extents from the plotted values themselves so that
        # all-negative stats are handled correctly.
        x_vals = [point[0] for point in points.values()]
        y_vals = [point[1] for point in points.values()]
        min_x, max_x = min(x_vals), max(x_vals)
        min_y, max_y = min(y_vals), max(y_vals)

        if square:
            min_x = min_y = min(min_x, min_y)
            max_x = max_y = max(max_x, max_y)

        # Pad each axis by a sixth of its range on both sides.
        x_diff = max_x - min_x
        y_diff = max_y - min_y
        plt.xlim(min_x - x_diff/6.0, max_x + x_diff/6.0)
        plt.ylim(min_y - y_diff/6.0, max_y + y_diff/6.0)

    # X and Y labels
    plt.ylabel(stat_y, size='large')
    plt.xlabel(stat_x, size='large')

    plt.show()
def plot_box_and_hist(filename, xlabel, comparison_val):
    """
    Draws a notched, horizontal box plot above a 12-bin histogram of the
    data loaded from filename.

    Both panels get two vertical lines: a green one at the mean of the data
    and a blue one at comparison_val. xlabel is passed to plt.xlabel as the
    label text.

    For example, when plotting the at-home point spreads, 0.0 is a good
    comparison value.
    """
    values = get_data(filename)
    mean_val = np.mean(values)
    # (x position, color) of the vertical marker lines drawn on both panels.
    markers = ((mean_val, 'g'), (comparison_val, 'b'))

    figure = plt.figure(figsize=(6, 4))
    box_ax = figure.add_axes([0.1, 0.7, 0.7, 0.15])
    hist_ax = figure.add_axes([0.1, 0.1, 0.7, 0.6])

    # Use one font size for the histogram's axis labels and tick labels.
    for axis in (hist_ax.xaxis, hist_ax.yaxis):
        axis.label.set_fontsize(13)
    tick_labels = hist_ax.get_xticklabels() + hist_ax.get_yticklabels()
    for tick_label in tick_labels:
        tick_label.set_fontsize(13)

    box_ax.boxplot(values, notch=True, vert=False)
    for x_pos, line_color in markers:
        box_ax.plot([x_pos, x_pos], [0, 3], lw=3, color=line_color)

    # The first item returned by hist() holds the per-bin counts.
    counts = hist_ax.hist(values, bins=12, color='gray')[0]
    peak = max(counts)
    # Leave headroom of a sixth of the tallest bin above the bars.
    y_top = peak+peak/6
    for x_pos, line_color in markers:
        hist_ax.plot([x_pos, x_pos], [0, y_top], lw=3, color=line_color)

    hist_ax.set_ylim(0, y_top)

    box_ax.set_xticklabels([])
    box_ax.set_yticks([])
    hist_ax.set_yticks([])

    plt.xlabel(xlabel, size='large')

    plt.show()
Both tests result in very large t-values, mainly because the sample size
is so large (over 1500 games). I don't find this very interesting.
"""


def get_std(dat1, dat2):
    """
    Returns the pooled variance of the two samples.

    This is the sum of squared deviations of each sample from its own mean,
    divided by (len(dat1) + len(dat2) - 2). Despite the function's name, no
    square root is taken.
    """
    def squared_deviations(sample):
        # Sum of squared distances from the sample's own mean.
        center = np.mean(sample)
        return sum((x - center)**2 for x in sample)

    total = squared_deviations(dat1) + squared_deviations(dat2)
    degrees_of_freedom = len(dat1) + len(dat2) - 2.0
    return total / degrees_of_freedom


if __name__ == "__main__":
    home = get_data('data/home_ppg.dat')
    home_mean, home_n = np.mean(home), len(home)

    away = get_data('data/away_ppg.dat')
    away_mean, away_n = np.mean(away), len(away)

    std = get_std(home, away)
    print (home_mean - away_mean) / sqrt(std * (1.0/home_n + 1.0/away_n))

    spread = []
    for i in xrange(len(home)):
        spread.append(home[i] - away[i])
    spread_mean = np.mean(spread)
    spread_std = np.std(spread)
    f = file('data/home_spread.dat', 'w')
    for val in spread: