Ejemplo n.º 1
0
    def get_opponent_vector(self):
        """Return the averaged input vector of the hitters this entry faces.

        Queries every ``PregameHitterGameEntry`` that shares this entry's
        ``game_date``, ``game_time`` and ``home_team`` but whose
        ``is_home_team`` flag differs from this entry's, sums their
        ``to_input_vector_raw()`` results, and hands the sum to
        ``PregameHitterGameEntry.avg_input_vector``.

        Returns:
            The value produced by ``PregameHitterGameEntry.avg_input_vector``
            for the summed hitter vector.
        """
        database_session = MlbDatabase().open_session()
        try:
            # Pregame entries of the hitters on the other side of this game.
            opposing_hitter_entries = database_session.query(
                PregameHitterGameEntry).filter(
                    PregameHitterGameEntry.game_date == self.game_date,
                    PregameHitterGameEntry.game_time == self.game_time,
                    PregameHitterGameEntry.home_team == self.home_team,
                    PregameHitterGameEntry.is_home_team != self.is_home_team)

            # Accumulator of length 31.
            # NOTE(review): presumably the length of to_input_vector_raw();
            # confirm against PregameHitterGameEntry.
            hitter_array = np.zeros(31)
            for hitter_entry in opposing_hitter_entries:
                hitter_array += hitter_entry.to_input_vector_raw()
        finally:
            # Release the session even if the query or a conversion raises.
            database_session.close()

        return PregameHitterGameEntry.avg_input_vector(hitter_array)
Ejemplo n.º 2
0
class PitcherRegressionForestTrainer(RegressionForest):
    """Random-forest regressor for a pitcher's actual DraftKings points.

    Each training sample is the concatenation of the pitcher's pregame input
    vector, the opponent vector, the park-factor vector and the umpire
    vector; the target is the postgame ``actual_draftkings_points``.
    """

    SIZE_TRAINING_BATCH = 900

    # File the fitted forest is saved to and restored from.
    _MODEL_FILE_NAME = 'pitcher_regression_forest.pkl'

    # Second component of the key used for every ParkEntry lookup.
    _PARK_FACTOR_YEAR = "2016"

    def __init__(self):
        """Open a database session; no model is loaded yet."""
        self._database_session = MlbDatabase().open_session()
        self._decision_tree = None

    def get_stochastic_batch(self, input_query, num_samples=None):
        """Build a randomly ordered batch of (features, target) samples.

        Args:
            input_query: Iterable of postgame pitcher entries.
            num_samples: Number of matched entries to draw. Defaults to all
                entries for which a pregame entry was found.

        Returns:
            A tuple ``(x, y)``: ``x`` is a list of feature lists and ``y`` a
            list of one-element lists holding ``actual_draftkings_points``.
            Samples whose pregame entry has no ``game_entry`` are skipped, so
            fewer than ``num_samples`` rows may be returned.

        Raises:
            ValueError: From ``random.sample`` when ``num_samples`` exceeds
                the number of matched entries.
        """
        potential_samples = list()
        for postgame_entry in input_query:
            pregame_entry = self._database_session.query(
                PregamePitcherGameEntry).get(
                    (postgame_entry.rotowire_id, postgame_entry.game_date,
                     postgame_entry.game_time))
            if pregame_entry is not None:
                potential_samples.append((pregame_entry, postgame_entry))
            else:
                print("Can't find %s %s %s %s" % (
                    postgame_entry.rotowire_id, postgame_entry.home_team,
                    postgame_entry.game_date, postgame_entry.game_time))

        if num_samples is None:
            num_samples = len(potential_samples)
        player_samples = random.sample(potential_samples, num_samples)

        x = list()
        y = list()
        for pregame_entry, postgame_entry in player_samples:
            input_vector = pregame_entry.to_input_vector()

            if pregame_entry.game_entry is None:
                print("NoneType game entry for %s %s %s %s" % (
                    pregame_entry.rotowire_id, pregame_entry.home_team,
                    pregame_entry.game_date, pregame_entry.game_time))
                continue

            umpire_vector = self._get_umpire_vector(
                pregame_entry.game_entry.umpire)
            park_vector = self._get_park_vector(pregame_entry.home_team)

            final_pitcher_array = np.concatenate([
                input_vector,
                pregame_entry.get_opponent_vector(), park_vector, umpire_vector
            ])
            x.append(final_pitcher_array.tolist())
            y.append([postgame_entry.actual_draftkings_points])

        return x, y

    def _get_umpire_vector(self, umpire):
        """Return the umpire's input vector, or nominal data if unknown."""
        ump_entry = None
        if umpire is not None:
            ump_entry = self._database_session.query(
                UmpireCareerEntry).get(umpire)

        # Both "no umpire assigned" and "umpire not in the table" fall back
        # to the nominal vector.
        if ump_entry is None:
            return UmpireCareerEntry.get_nominal_data(self._database_session)
        return ump_entry.to_input_vector()

    def _get_park_vector(self, home_team):
        """Return the park-factor vector for ``home_team``, or [100, 100]."""
        park_factors = self._database_session.query(ParkEntry).get(
            (home_team, self._PARK_FACTOR_YEAR))
        if park_factors is None:
            print("Pitcher regression forest: Could not find %s from %s" % (
                home_team, self._PARK_FACTOR_YEAR))
            return np.array([100, 100])
        return park_factors.to_input_vector()

    def train_network(self):
        """Load the saved forest, or fit, save and evaluate a new one.

        When no saved model can be loaded, a 1000-tree forest is fitted on
        the training split returned by ``self.get_train_eval_data`` (not
        defined in this class) called with ``0.8``, then saved, and its
        median absolute error on the evaluation split is printed.

        The database session is closed when this method returns or raises.
        """
        try:
            self.load_model()
            if self._decision_tree is None:
                self._decision_tree = RandomForestRegressor(n_estimators=1000)
                db_query = self._database_session.query(
                    PostgamePitcherGameEntry)
                mlb_training_data, mlb_evaluation_data = (
                    self.get_train_eval_data(db_query, 0.8))
                x_train, y_train = self.get_stochastic_batch(
                    mlb_training_data)
                self._decision_tree.fit(x_train, np.ravel(y_train))
                self.save_model()
                x_eval, y_eval = self.get_stochastic_batch(
                    mlb_evaluation_data)
                y_eval_predictions = np.array(
                    self._decision_tree.predict(x_eval))
                y_eval = np.array(y_eval)
                print("Pitcher Training Size: %i | Pitcher Evaluation Size: %i"
                      % (len(x_train), len(x_eval)))
                print("Pitcher median absolute error: %f" %
                      median_absolute_error(y_eval, y_eval_predictions))
        finally:
            # Close the session even when loading or training fails.
            self._database_session.close()

    def get_prediction(self, input_data):
        """Return the forest's prediction for a single feature vector."""
        return self._decision_tree.predict([input_data])

    def _per_tree_predictions(self, input_data):
        """Return one prediction per tree for a single feature vector."""
        sample = input_data.reshape(1, len(input_data))
        return [tree.predict(sample)
                for tree in self._decision_tree.estimators_]

    def get_prediction_interval(self, input_data, percentile=95):
        """Return the (lower, upper) bounds of the per-tree predictions.

        Args:
            input_data: One-dimensional array exposing ``reshape``.
            percentile: Width of the central interval in percent.

        Returns:
            Tuple ``(err_down, err_up)`` of the per-tree prediction
            percentiles at ``(100 - percentile) / 2`` and its complement.
        """
        preds = self._per_tree_predictions(input_data)
        err_down = np.percentile(preds, (100 - percentile) / 2.)
        err_up = np.percentile(preds, 100 - (100 - percentile) / 2.)
        return err_down, err_up

    def get_std_dev(self, input_data):
        """Return the standard deviation of the per-tree predictions."""
        return np.std(self._per_tree_predictions(input_data))

    def save_model(self):
        """Save the forest to disk; a failure is reported, not raised."""
        try:
            joblib.dump(self._decision_tree, self._MODEL_FILE_NAME)
        except Exception as error:
            print("Could not save %s: %s" % (self._MODEL_FILE_NAME, error))

    def load_model(self):
        """Load the forest from disk; a failure is reported, not raised.

        On failure ``self._decision_tree`` is left unchanged.
        """
        # NOTE(review): joblib.load unpickles the file and can execute
        # arbitrary code; only load model files from a trusted location.
        try:
            self._decision_tree = joblib.load(self._MODEL_FILE_NAME)
        except Exception as error:
            print("Could not load %s: %s" % (self._MODEL_FILE_NAME, error))
Ejemplo n.º 3
0
from sql.mlb_database import MlbDatabase
from sql.lineup import LineupEntry
from sql.postgame_hitter import PostgameHitterGameEntry
from sql.pregame_hitter import PregameHitterGameEntry
from sql.postgame_pitcher import PostgamePitcherGameEntry
from sql.pregame_pitcher import PregamePitcherGameEntry
from datetime import date, timedelta
from numpy import array, std, mean

# Session used for every query in this script.
database_session = MlbDatabase().open_session()

# Lineups whose game date differs from today's date.
# NOTE(review): the column is compared against a datetime.date object;
# confirm LineupEntry.game_date is stored in a comparable type.
query_results = database_session.query(LineupEntry).filter(
    LineupEntry.game_date != date.today())
# Accumulators filled by the loop over query_results; all start empty.
# lineup_actual_salary is incremented with actual_draftkings_points values.
lineup_predicted_salary = 0
lineup_actual_salary = 0
lineup_actual_vector = list()
lineup_predicted_vector = list()
for query_result in query_results:
    try:
        #TODO: fix these gets by using the GameEntry to get the game time and such
        # TODO: this needs to be altered to accommodate double headers, but is not a big priority
        lineup_actual_salary += database_session.query(
            PostgameHitterGameEntry).get(
                (query_result.catcher,
                 query_result.game_date)).actual_draftkings_points
        lineup_actual_salary += database_session.query(
            PostgamePitcherGameEntry).get(
                (query_result.starting_pitcher_1,
                 query_result.game_date)).actual_draftkings_points
        lineup_actual_salary += database_session.query(
            PostgamePitcherGameEntry).get(