mdp = GameMDP(game)
env = Environment(mdp)
qlearning = QLearning(
    env=env,
    qfunction=TabularVF(random_state=seed),
    policy=RandomPolicy(action_space=env.action_space, random_state=seed),
    learning_rate=0.1,
    discount_factor=1.0,
    selfplay=True
)


class Monitor(Callback):

    def on_episode_begin(self, episode, qfunction):
        if episode % 100 == 0:
            print('Episode {}'.format(episode))


qlearning.train(
    n_episodes=70000,
    callbacks=[
        Monitor(),
        EpisodicWLDPlotter(
            game=game,
            opp_player=RandPlayer(random_state=seed),
            n_matches=1000,
            period=1000,
            filepath='../mlnd-capstone-report/figures/tic_ql_tab_full_selfplay_wld_plot.pdf'
        )
    ]
)
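# A minimal, self-contained sketch of the tabular Q-learning backup that a learner
# like the one above presumably performs with learning_rate=0.1 and
# discount_factor=1.0. Illustrative only, not the capstone.rl implementation; the
# names (q_table, tabular_q_update) are hypothetical.
from collections import defaultdict

def tabular_q_update(q_table, state, action, reward, next_state, next_actions,
                     learning_rate=0.1, discount_factor=1.0):
    """One backup: Q(s,a) += lr * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
    best_next = max((q_table[(next_state, a)] for a in next_actions), default=0.0)
    td_target = reward + discount_factor * best_next
    td_error = td_target - q_table[(state, action)]
    q_table[(state, action)] += learning_rate * td_error

q_table = defaultdict(float)
tabular_q_update(q_table, state='s0', action='a0', reward=1.0,
                 next_state='s1', next_actions=['a0', 'a1'])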
mdp = FixedGameMDP(get_random_game(), RandPlayer(random_state=seed), 1)
env = Environment(mdp)
qlearning.env = env
egreedy.action_space = env.actions
qlearning.policy.provider = env.actions


class Monitor(Callback):

    def on_episode_begin(self, episode, qfunction):
        if episode % 50 == 0:
            print('Episode {}'.format(episode))


# prepopulate replay memory? read the DeepMind paper (see the sketch below)
qlearning.train(
    n_episodes=15000,
    callbacks=[
        EpisodicWLDPlotter(
            game=get_random_loss_game,
            opp_player=RandPlayer(random_state=seed),
            n_matches=1000,
            period=250,
            filepath='figures/c4dn_uci_losses.pdf'
        ),
        LinearAnnealing(egreedy, 'epsilon', init=1.0, final=0.1, n_episodes=5000),
        Monitor()
    ]
)
# re-run the experiment with wins to use get_random_game
# consolidate the experiment in one file
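# A minimal sketch of what "prepopulate the replay memory" could mean, in the
# spirit of the DeepMind DQN paper: fill the buffer with transitions generated by a
# uniformly random policy before any learning updates. Everything here is a
# hypothetical stand-in; the env interface (reset/cur_state/actions/step/is_terminal)
# is an assumption, not the capstone.rl Environment API.
import random
from collections import deque

def prepopulate_replay_memory(env, size=1000, seed=23):
    rng = random.Random(seed)
    memory = deque(maxlen=size)
    env.reset()
    while len(memory) < size:
        state = env.cur_state()
        action = rng.choice(env.actions())
        next_state, reward = env.step(action)
        memory.append((state, action, reward, next_state, env.is_terminal()))
        if env.is_terminal():
            env.reset()
    return memory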
qlearning.policy.provider = env.action_space


class Monitor(Callback):

    def on_episode_begin(self, episode, qfunction):
        if episode % 50 == 0:
            print('Episode {}'.format(episode))


# prepopulate replay memory? read the DeepMind paper
period = 500
n_matches = 1000
qlearning.train(
    n_episodes=15000,
    callbacks=[
        EpisodicWLDPlotter(
            game=get_random_win_game,
            opp_player=RandPlayer(random_state=seed),
            n_matches=n_matches,
            period=period,
            filepath='figures/c4dn_uci_wins.pdf'
        ),
        EpisodicWLDPlotter(
            game=get_random_draw_game,
            opp_player=RandPlayer(random_state=seed),
            n_matches=n_matches,
            period=period,
            filepath='figures/c4dn_uci_draws.pdf'
        ),
        EpisodicWLDPlotter(
            game=get_random_loss_game,
            opp_player=RandPlayer(random_state=seed),
            n_matches=n_matches,
            period=period,
            filepath='figures/c4dn_uci_losses.pdf'
        ),
        LinearAnnealing(egreedy, 'epsilon', init=1.0, final=0.1, n_episodes=5000),
        Monitor()
    ]
)
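# Sketch of the epsilon schedule a LinearAnnealing callback presumably applies:
# interpolate linearly from init to final over n_episodes, then hold at final.
# Illustrative only; the actual capstone.rl.utils.LinearAnnealing may differ.
def linear_annealing(episode, init=1.0, final=0.1, n_episodes=5000):
    if episode >= n_episodes:
        return final
    return init + (final - init) * episode / n_episodes

assert linear_annealing(0) == 1.0
assert abs(linear_annealing(2500) - 0.55) < 1e-9
assert linear_annealing(10000) == 0.1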
    selfplay=True,
    experience_replay=True,
    replay_memory_size=10000,
    batch_size=32
)


class Monitor(Callback):

    def on_episode_begin(self, episode, qfunction):
        if episode % 50 == 0:
            print('Episode {}'.format(episode))


qlearning.train(
    n_episodes=1750,
    callbacks=[
        EpisodicWLDPlotter(
            game=game,
            opp_player=RandPlayer(),
            n_matches=1000,
            period=250,
            filepath='figures/c4_dqn_simple.pdf'
        ),
        # LinearAnnealing(egreedy, 'epsilon', init=1.0, final=0.1, n_episodes=1000),
        Monitor()
    ]
)

from capstone.game.players import GreedyQ

g = GreedyQ(qnetwork)
print('Move:', g.choose_move(game))
# IMPORTANT: don't forget to filter the best value, ignore the illegal moves
# (see the sketch below)
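# Sketch of the "ignore illegal moves" filtering noted above: take the argmax of
# the network's action values restricted to the legal moves of the current
# position. This is a hypothetical helper, not the GreedyQ implementation;
# q_values is assumed to map each move to a score, legal_moves to the moves the
# game currently allows.
def best_legal_move(q_values, legal_moves):
    return max(legal_moves, key=lambda move: q_values[move])

q_values = {0: 0.2, 1: 0.9, 2: -0.4}
print(best_legal_move(q_values, legal_moves=[0, 2]))  # prints 0, since move 1 is illegal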
qlearning.env = env
egreedy.action_space = env.actions
qlearning.policy.provider = env.actions


class Monitor(Callback):

    def on_episode_begin(self, episode, qfunction):
        if episode % 50 == 0:
            print('Episode {}'.format(episode))


# prepopulate replay memory? read the DeepMind paper
qlearning.train(
    n_episodes=5000,
    callbacks=[
        EpisodicWLDPlotter(
            # game=get_random_game,
            game=get_random_win_game,
            # game=game,
            opp_player=RandPlayer(random_state=seed),
            n_matches=1000,
            period=250,
            filepath='figures/c4_dqn_uci.pdf'
        ),
        LinearAnnealing(egreedy, 'epsilon', init=1.0, final=0.1, n_episodes=10000),
        Monitor()
    ]
)
# got 90% with 42 input units, 3 hidden layers, 7 output units, 400 hidden units,
# lr=0.001, no selfplay, only a 1,000-transition experience replay, 100,000 episodes,
# and linear annealing over the first 10,000 episodes only
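# Sketch of the choice an epsilon-greedy policy like EGreedy presumably makes:
# with probability epsilon pick a random legal move, otherwise the greedy one.
# Illustrative stand-in, not the capstone EGreedy class; q_values and legal_moves
# are the same hypothetical inputs as in the filtering sketch above.
import random

def egreedy_action(q_values, legal_moves, epsilon, rng=random):
    if rng.random() < epsilon:
        return rng.choice(legal_moves)
    return max(legal_moves, key=lambda move: q_values[move])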
from capstone.rl.utils import EpisodicWLDPlotter, Callback, LinearAnnealing
from capstone.rl.value_functions import MLP, QNetwork

# game = Connect4()
game = TicTacToe()
# mdp = GameMDP(game)
mdp = FixedGameMDP(game, RandPlayer(), 1)
env = Environment(mdp)
# qnetwork = QNetwork(n_input_units=42, n_output_units=7)
qnetwork = QNetwork(n_input_units=9, n_hidden_layers=3, n_output_units=9, n_hidden_units=100)
# qnetwork = QNetwork(n_input_units=42, n_hidden_layers=3, n_output_units=7, n_hidden_units=100)
egreedy = EGreedy(env.actions, qnetwork, 1.0)
qlearning = ApproximateQLearning(
    env=env,
    qfunction=qnetwork,
    policy=EGreedy(env.actions, qnetwork, 0.3),
    discount_factor=0.99,  # change this to 1.0, and say it is because the game is deterministic
    n_episodes=100000,
    experience_replay=False
)
qlearning.train(callbacks=[
    EpisodicWLDPlotter(
        game=game,
        opp_player=RandPlayer(),
        n_matches=500,
        period=1000,
        filepath='figures/c4_ql_mlp_fixed.pdf'
    ),
    LinearAnnealing(egreedy, 'epsilon', init=1.0, final=0.1, n_episodes=50000)
])
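# For reference, a network with the same shape as the QNetwork above (9 board
# inputs, 3 hidden layers of 100 units, 9 action-value outputs), sketched here in
# PyTorch purely for illustration; the capstone QNetwork may be built on a
# different framework and differ in activations and initialization.
import torch
import torch.nn as nn

class TicTacToeQNet(nn.Module):
    def __init__(self, n_input_units=9, n_hidden_units=100, n_hidden_layers=3,
                 n_output_units=9):
        super().__init__()
        layers, n_in = [], n_input_units
        for _ in range(n_hidden_layers):
            layers += [nn.Linear(n_in, n_hidden_units), nn.ReLU()]
            n_in = n_hidden_units
        layers.append(nn.Linear(n_in, n_output_units))
        self.net = nn.Sequential(*layers)

    def forward(self, board):
        # board: batch of flattened 3x3 positions encoded as floats
        return self.net(board)

q_values = TicTacToeQNet()(torch.zeros(1, 9))  # one Q-value per board cell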
from capstone.game.games import TicTacToe
from capstone.game.players import RandPlayer
from capstone.rl import Environment, GameMDP
from capstone.rl.learners import ApproxQLearningSelfPlay
from capstone.rl.policies import RandomPolicy
from capstone.rl.utils import EpisodicWLDPlotter
from capstone.rl.value_functions import MLP

seed = 23
game = TicTacToe()
mdp = GameMDP(game)
env = Environment(mdp)
mlp = MLP()
qlearning = ApproxQLearningSelfPlay(
    env=env,
    qfunction=mlp,
    policy=RandomPolicy(env.actions, random_state=seed),
    discount_factor=0.99,
    n_episodes=100000,
    callbacks=[
        EpisodicWLDPlotter(
            game=game,
            opp_player=RandPlayer(random_state=seed),
            n_matches=100,
            period=1000,
            filepath='figures/tic_ql_mlp_selfplay_all.pdf'
        )
    ]
)
qlearning.train()
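# A rough, hypothetical sketch of the loop a self-play learner runs: the same agent
# chooses the moves for both sides, so every transition it learns from was generated
# by its own (exploring) policy. The game/choose_move/update interfaces below are
# assumptions for illustration, not the ApproxQLearningSelfPlay internals.
def selfplay_episode(game, choose_move, update):
    while not game.is_over():
        state = game.copy()
        move = choose_move(state)           # the learner plays both X and O in turn
        game.make_move(move)
        reward = game.outcome() if game.is_over() else 0.0
        update(state, move, reward, game)   # one learning step per transition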
qnetwork = QNetwork(mapping, n_input_units=9, n_hidden_layers=1, n_output_units=9,
                    n_hidden_units=100)
egreedy = EGreedy(env.actions, qnetwork, 1.0)
qlearning = ApproximateQLearning(
    env=env,
    qfunction=qnetwork,
    policy=egreedy,
    discount_factor=1.0,
    experience_replay=True,
    batch_size=32
)
qlearning.train(n_episodes=10000, callbacks=[
    EpisodicWLDPlotter(
        game=game,
        opp_player=RandPlayer(),
        n_matches=1000,
        period=250,
        filepath='figures/tic_deep_ql.pdf'
    ),
    LinearAnnealing(egreedy, 'epsilon', init=1.0, final=0.1, n_episodes=5000)
])
# n_episodes = 4,000
# n_episodes_annealing = 2,000
# mention that I tried Adam and RMSProp but they did not work
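# Sketch of the experience-replay step implied by experience_replay=True and
# batch_size=32: sample a random minibatch of stored transitions and build the
# usual Q-learning targets from it. Illustrative only; the names (replay_memory,
# predict_q, the 6-tuple transition layout) are hypothetical, not the capstone.rl
# API.
import random

def sample_targets(replay_memory, predict_q, batch_size=32, discount_factor=1.0):
    batch = random.sample(replay_memory, min(batch_size, len(replay_memory)))
    targets = []
    for state, action, reward, next_state, terminal, legal_moves in batch:
        target = reward
        if not terminal:
            target += discount_factor * max(predict_q(next_state, m) for m in legal_moves)
        targets.append((state, action, target))
    return targets  # then fit the network so Q(state, action) moves toward target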
    experience_replay=True,
    replay_memory_size=10000,
    batch_size=32
)


class Monitor(Callback):

    def on_episode_begin(self, episode, qfunction):
        if episode % 50 == 0:
            print('Episode {}'.format(episode))


qlearning.train(n_episodes=1200, callbacks=[
    EpisodicWLDPlotter(
        game=game,
        opp_player=RandPlayer(random_state=seed),
        n_matches=1000,
        period=25,
        filepath='figures/c4_dqn_easy_plot.pdf'
    ),
    LinearAnnealing(egreedy, 'epsilon', init=1.0, final=0.1, n_episodes=1000),
    Monitor()
])

from capstone.game.players import GreedyQ

g = GreedyQ(qnetwork)
print('Move:', g.choose_move(game))
# IMPORTANT: don't forget to filter the best value, ignore the illegal moves