Ejemplo n.º 1
0
    def rate(self, unstacked):
        """
        Run a Markov Chain Monte Carlo (MCMC) simulation on the defined
        directed graphical model (aka Bayesian Network).

        The model built here treats the possessions of each game as normally
        distributed around ``intercept + pace[home] + pace[away]``, where the
        per-team pace values are re-centered so that they average to zero.

        References
        ----------
        TODO

        Parameters
        ----------
        unstacked : dataframe
            Unstacked dataframe containing game and stat information. This
            method reads the ``dt``, ``i_hteam``, ``i_ateam`` and ``poss``
            columns.

        Returns
        -------
        model : pymc.Model
            The model containing every node defined in this method.
        mcmc : pymc.MCMC
            The sampler wrapping ``model``, after ``sample`` has been called
            with ``self.n_samples`` iterations and
            ``self.n_samples * self.burn_rate`` burn-in iterations.

        Raises
        ------
        AssertionError
            If ``self.burn_rate`` is not within [0, 1]. Note that this check
            is an ``assert`` and is therefore skipped under ``python -O``.
        """
        util.validate_games(unstacked, ['poss'])
        assert (self.burn_rate >= 0) and (self.burn_rate <= 1), \
            "burn rate must be between 0 and 1, but was %s" % self.burn_rate

        # DataFrame.sort(columns) was deprecated in pandas 0.17 and removed in
        # 0.20; use sort_values when it exists and fall back for older pandas.
        if hasattr(unstacked, 'sort_values'):
            unstacked = unstacked.sort_values('dt')
        else:
            unstacked = unstacked.sort('dt')
        teams = Pace._get_teams(unstacked)

        num_teams = teams.shape[0]
        home_team_idx = unstacked.i_hteam.values
        away_team_idx = unstacked.i_ateam.values
        observed_pace = unstacked.poss.values
        pace_initial = self._initial_guess()

        # NOTE: the stochastic nodes below are created in the same order as
        # before so that any random initial values are drawn in the same order.

        # Precision of the team pace prior; the bounds correspond to standard
        # deviations of 40 and 20.
        # Alternative prior that was tried:
        # tau = 1. / pymc.Uniform('sigma', 3, 20)**2
        tau = pymc.Uniform('tau', 1. / 40**2, 1. / 20**2)
        pace_prior = pymc.Normal("pace_prior", mu=0, tau=tau, size=num_teams, value=pace_initial)
        pace_intercept = pymc.Normal('intercept', 66, 1 / 1**2, value=66)

        @pymc.deterministic
        def pace_rtg(pace=pace_prior):
            # Re-center the team paces around zero; the subtraction already
            # produces a new array, so no explicit copy is needed.
            return pace - np.mean(pace)

        @pymc.deterministic
        def mu_pace(home_team=home_team_idx, away_team=away_team_idx,
                    paces=pace_rtg, pace_intercept=pace_intercept):
            # Expected possessions per game: intercept plus both teams' paces.
            return pace_intercept + paces[home_team] + paces[away_team]

        # Observation noise: precision derived from a uniform standard deviation.
        tau_poss = 1. / pymc.Uniform('sigma_poss', 1., 10.)**2
        poss = pymc.Normal('poss', mu=mu_pace, tau=tau_poss, value=observed_pace, observed=True)
        # Unobserved node with the same distribution as the observed possessions.
        poss_pred = pymc.Normal('poss_pred', mu=mu_pace, tau=tau_poss)
        model = pymc.Model([mu_pace, pace_prior, tau, pace_rtg, poss, pace_intercept, tau_poss, poss_pred])
        # Optional MAP initialisation, currently disabled:
        # map_ = pymc.MAP(model)
        # map_.fit(method='fmin_powell')
        mcmc = pymc.MCMC(model)
        mcmc.sample(self.n_samples, self.n_samples * self.burn_rate)

        return model, mcmc
Ejemplo n.º 2
0
    def rate(self, unstacked):
        """
        Run an adjusted stat model rating for the games data provided.

        By default, provides incremental ratings throughout the season, running
        the rating algorithm every `game_skip` games. For example, if `game_skip`
        is 1, then the algorithm provides update ratings after each game played
        during the season, and there will be `num_games` sets of ratings. The run
        time gets progressively slower as the data included grows throughout the
        season.

        Stats are adjusted according to:
            adj = SUM raw_stat / adj_opp_stat * avg_stat * w_i *
                  loc_i + w_pre * stat_pre

        References
        ----------
        -Kenpom's own ratings explanation
            http://kenpom.com/blog/index.php/weblog/entry/ratings_explanation
        -Kenpom's explanation of margin of victory adjustment:
            http://kenpom.com/blog/index.php/weblog/entry/pomeroy_ratings_version_2.0
        -Kenpom's adjusted stats calculations explanation:
            http://kenpom.com/blog/index.php/weblog/entry/national_efficiency/

        Parameters
        ----------
        unstacked : dataframe
            Unstacked dataframe containing game and stat information.

        Returns
        -------
        unstacked : dataframe
            Original unstacked dataframe with ratings columns appended. When
            the data spans multiple seasons, the result of
            ``self._rate_multiple(unstacked)`` is returned instead.

        Notes
        -----
        For single-season input this method also stores
        ``self.offensive_ratings``, ``self.defensive_ratings``,
        ``self.results`` and ``self.team_index`` on the instance.
        """
        util.validate_games(unstacked, ['pts', 'poss', 'ppp'])
        if AdjustedStat._is_multiple_seasons(unstacked):
            return self._rate_multiple(unstacked)

        # need games to be in sequential order
        # DataFrame.sort(columns) was deprecated in pandas 0.17 and removed in
        # 0.20; use sort_values when it exists and fall back for older pandas.
        if hasattr(unstacked, 'sort_values'):
            unstacked = unstacked.sort_values('dt')
        else:
            unstacked = unstacked.sort('dt')
        teams, team_index = AdjustedStat._get_teams(unstacked)
        num_teams = teams.shape[0]
        num_games = unstacked.shape[0]
        unstacked = AdjustedStat._add_team_index(unstacked, team_index)

        idx, loc, oraw, draw = self._initialize(unstacked, teams)
        o_pre, d_pre = self._preseason_rank(teams)

        # Add the preseason rank as a starting point
        adj_o_history = [o_pre]
        adj_d_history = [d_pre]

        game_indices = unstacked[['i_hteam', 'i_ateam']].values
        # Per-team count of games seen so far (range works on Python 2 and 3).
        current_index = {team: 0 for team in range(num_teams)}
        dates = unstacked['dt'].values
        games_included = [0]
        zero_summary = AdjustedStat._empty_iteration_summary(date=dates[0])

        cumulative_home_o = np.zeros(num_games)
        cumulative_home_d = np.zeros(num_games)
        cumulative_away_o = np.zeros(num_games)
        cumulative_away_d = np.zeros(num_games)

        results = [zero_summary]
        prev_idx = 0
        # Ratings carried from one rated iteration to the next. They are
        # seeded on the first iteration that is actually rated, so skipping
        # game 0 (if _should_rate ever does that) no longer leaves them unbound.
        adj_o = adj_d = None
        has_ratings = False
        for gidx, (hidx, aidx) in enumerate(game_indices):
            # increment team vector indices to include new game
            current_index[hidx] += 1
            current_index[aidx] += 1
            if not self._should_rate(gidx, num_games):
                continue

            if self.verbose:
                # Single parenthesised argument: same output on Python 2 and 3.
                print('No. of games included: %s' % gidx)

            avg_o, avg_d = self._average_stats(oraw, draw, current_index)
            if not has_ratings:
                adj_o, adj_d = self._initial_guess(unstacked, teams, gidx)
            else:
                # the initial guess is simply the ratings from the previous iteration
                # TODO: some weird convergence issues for this method
                adj_o = adj_o.copy()
                adj_d = adj_d.copy()

            adj_o, adj_d, iter_results = \
                self._rate_one(oraw, draw, avg_o, avg_d, loc, idx, current_index,
                               o_pre, d_pre, start_o=adj_o, start_d=adj_d)
            has_ratings = True
            self._update_cumulative_ratings(cumulative_home_o, cumulative_home_d, cumulative_away_o, cumulative_away_d,
                                            adj_o, adj_d, game_indices[:, 0], game_indices[:, 1], gidx, prev_idx)
            # Store copies so later iterations cannot alter recorded history.
            adj_o_history.append(adj_o.copy())
            adj_d_history.append(adj_d.copy())
            results.append(iter_results)
            games_included.append(gidx + 1)
            prev_idx = gidx

        # add a rating column to include ratings for each game in the dataframe
        unstacked = AdjustedStat._rate_for_games(unstacked, games_included, adj_o_history, adj_d_history, self.stat)
        self.offensive_ratings = np.array(adj_o_history)
        self.defensive_ratings = np.array(adj_d_history)
        self.results = results
        self.team_index = team_index
        return unstacked