def test_training_TD_lambda_for_gridworld(model_class, train=True):
    """Optionally train on a fresh ``GridWorld`` and then run ``rl.test_policy``.

    Training, when requested, is delegated to
    ``rl.train_reinforcement_strategy_temporal_difference_eligibility_trace``
    with 2000 epochs.

    Args:
        model_class: Forwarded unchanged as ``model_class`` to the trainer.
        train: When truthy, run the training step before evaluating.
    """
    world = GridWorld()

    if train:
        trained = rl.train_reinforcement_strategy_temporal_difference_eligibility_trace(
            epochs=2000, game_obs=world, model_class=model_class
        )
        # The trainer's result is unpacked as a (policy, model) pair; neither
        # value is used afterwards in this function.
        _policy, _model = trained

    # NOTE(review): evaluation is assumed to run regardless of `train`
    # (the original layout was flattened) -- confirm against the project history.
    rl.test_policy(world)
def test_training_monte_carlo_for_blackjack(model_class):
    """Train a blackjack policy, print its decision/score tables and evaluate it.

    Training is delegated to ``rl.train_reinforcement_learning_strategy``
    with 5000 simulations on a fresh ``BlackJack`` game.

    Args:
        model_class: Forwarded unchanged as ``model_class`` to the trainer.

    Returns:
        The ``(policy, model)`` pair produced by the trainer.
    """
    blackjack = BlackJack()
    policy, model = rl.train_reinforcement_learning_strategy(
        num_sims=5000, game_obs=blackjack, model_class=model_class
    )

    # The transposed policy frame must hold exactly these four columns,
    # otherwise the column assignment raises.
    df = pd.DataFrame(policy).T
    df.columns = ["player_value", "dealer_value", "decision", "score"]

    # Pivot once and slice both tables from the same result. Keyword arguments
    # are used because pandas 2.0 no longer accepts them positionally.
    pivoted = df.pivot(index="player_value", columns="dealer_value")

    policy_Q_table = pivoted["decision"]
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(policy_Q_table)

    policy_Q_score = pivoted["score"]
    print(policy_Q_score)

    # TODO: add an IPython notebook 3D graph.

    # Test policy
    rl.test_policy(blackjack, model)

    return policy, model
def test_training_TD_for_blackjack(model_class):
    """Train a blackjack policy, print its decision/score tables and evaluate it.

    Training is delegated to
    ``rl.train_reinforcement_strategy_temporal_difference`` with 5000 epochs
    on a fresh ``BlackJack`` game.

    Args:
        model_class: Forwarded unchanged as ``model_class`` to the trainer.

    Returns:
        The ``(policy, model)`` pair produced by the trainer.
    """
    blackjack = BlackJack()
    policy, model = rl.train_reinforcement_strategy_temporal_difference(
        epochs=5000, game_obs=blackjack, model_class=model_class)

    # The transposed policy frame must hold exactly these four columns,
    # otherwise the column assignment raises.
    df = pd.DataFrame(policy).T
    df.columns = ['player_value', 'dealer_value', 'decision', 'score']

    # Pivot once and slice both tables from the same result. Keyword arguments
    # are used because pandas 2.0 no longer accepts them positionally.
    pivoted = df.pivot(index='player_value', columns='dealer_value')

    policy_Q_table = pivoted['decision']
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(policy_Q_table)

    policy_Q_score = pivoted['score']
    print(policy_Q_score)

    # TODO: add an IPython notebook 3D graph.

    # Test policy
    # NOTE(review): unlike the Monte Carlo variant in this file, the trained
    # `model` is not passed to rl.test_policy here -- confirm this is intended.
    rl.test_policy(blackjack)

    return policy, model
def test_training_TD_for_blackjack(model_class):
    """Train a blackjack policy, print its decision/score tables and evaluate it.

    Training is delegated to
    ``rl.train_reinforcement_strategy_temporal_difference`` with 5000 epochs
    on a fresh ``BlackJack`` game.

    Args:
        model_class: Forwarded unchanged as ``model_class`` to the trainer.

    Returns:
        The ``(policy, model)`` pair produced by the trainer.
    """
    blackjack = BlackJack()
    policy, model = rl.train_reinforcement_strategy_temporal_difference(
        epochs=5000, game_obs=blackjack, model_class=model_class
    )

    # The transposed policy frame must hold exactly these four columns,
    # otherwise the column assignment raises.
    df = pd.DataFrame(policy).T
    df.columns = ["player_value", "dealer_value", "decision", "score"]

    # Pivot once and slice both tables from the same result. Keyword arguments
    # are used because pandas 2.0 no longer accepts them positionally.
    pivoted = df.pivot(index="player_value", columns="dealer_value")

    policy_Q_table = pivoted["decision"]
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(policy_Q_table)

    policy_Q_score = pivoted["score"]
    print(policy_Q_score)

    # TODO: add an IPython notebook 3D graph.

    # Test policy
    # NOTE(review): unlike the Monte Carlo variant in this file, the trained
    # `model` is not passed to rl.test_policy here -- confirm this is intended.
    rl.test_policy(blackjack)

    return policy, model
def test_training_monte_carlo_for_blackjack(model_class):
    """Train a blackjack policy, print its decision/score tables and evaluate it.

    Training is delegated to ``rl.train_reinforcement_learning_strategy``
    with 5000 simulations on a fresh ``BlackJack`` game.

    Args:
        model_class: Forwarded unchanged as ``model_class`` to the trainer.

    Returns:
        The ``(policy, model)`` pair produced by the trainer.
    """
    blackjack = BlackJack()
    policy, model = rl.train_reinforcement_learning_strategy(
        num_sims=5000, game_obs=blackjack, model_class=model_class)

    # The transposed policy frame must hold exactly these four columns,
    # otherwise the column assignment raises.
    df = pd.DataFrame(policy).T
    df.columns = ['player_value', 'dealer_value', 'decision', 'score']

    # Pivot once and slice both tables from the same result. Keyword arguments
    # are used because pandas 2.0 no longer accepts them positionally.
    pivoted = df.pivot(index='player_value', columns='dealer_value')

    policy_Q_table = pivoted['decision']
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(policy_Q_table)

    policy_Q_score = pivoted['score']
    print(policy_Q_score)

    # TODO: add an IPython notebook 3D graph.

    # Test policy
    rl.test_policy(blackjack, model)

    return policy, model
def test_training_TD_lambda_for_gridworld(model_class, train=True):
    """Optionally train on a fresh ``GridWorld`` and then run ``rl.test_policy``.

    Training, when requested, is delegated to
    ``rl.train_reinforcement_strategy_temporal_difference_eligibility_trace``
    with 2000 epochs.

    Args:
        model_class: Forwarded unchanged as ``model_class`` to the trainer.
        train: When truthy, run the training step before evaluating.
    """
    world = GridWorld()

    if train:
        trained = rl.train_reinforcement_strategy_temporal_difference_eligibility_trace(
            epochs=2000, game_obs=world, model_class=model_class)
        # The trainer's result is unpacked as a (policy, model) pair; neither
        # value is used afterwards in this function.
        _policy, _model = trained

    # NOTE(review): evaluation is assumed to run regardless of `train`
    # (the original layout was flattened) -- confirm against the project history.
    rl.test_policy(world)