Example #1
mdp = GameMDP(game)
env = Environment(mdp)
qlearning = QLearning(env=env,
                      qfunction=TabularVF(random_state=seed),
                      policy=RandomPolicy(action_space=env.action_space,
                                          random_state=seed),
                      learning_rate=0.1,
                      discount_factor=1.0,
                      selfplay=True)


class Monitor(Callback):
    def on_episode_begin(self, episode, qfunction):
        if episode % 100 == 0:
            print('Episode {}'.format(episode))


qlearning.train(
    n_episodes=70000,
    callbacks=[
        Monitor(),
        EpisodicWLDPlotter(
            game=game,
            opp_player=RandPlayer(random_state=seed),
            n_matches=1000,
            period=1000,
            filepath='../mlnd-capstone-report/figures/tic_ql_tab_full_selfplay_wld_plot.pdf'
        )
    ])
Example #2
        mdp = FixedGameMDP(get_random_game(), RandPlayer(random_state=seed), 1)
        env = Environment(mdp)
        qlearning.env = env
        egreedy.action_space = env.actions
        qlearning.policy.provider = env.actions
        if episode % 50 == 0:
            print('Episode {}'.format(episode))


# TODO: prepopulate the replay memory before training? See the DeepMind DQN paper
#       (a rough sketch follows this example's train() call below).

qlearning.train(n_episodes=15000,
                callbacks=[
                    EpisodicWLDPlotter(
                        game=get_random_loss_game,
                        opp_player=RandPlayer(random_state=seed),
                        n_matches=1000,
                        period=250,
                        filepath='figures/c4dn_uci_losses.pdf'),
                    LinearAnnealing(egreedy,
                                    'epsilon',
                                    init=1.0,
                                    final=0.1,
                                    n_episodes=5000),
                    Monitor()
                ])
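
A rough sketch of the "prepopulate the replay memory" idea from the TODO above, assuming a Gym-style env.reset()/env.step(action) loop and an env.legal_actions(state) helper (these names are illustrative assumptions, not part of the capstone API):

import random
from collections import deque

replay_memory = deque(maxlen=10000)  # buffer capacity is an arbitrary choice here

def prepopulate(env, n_transitions=1000):
    # Fill the memory with random-play transitions before learning starts, as in
    # the DeepMind DQN setup, so the first minibatches are not drawn from a
    # nearly empty, highly correlated buffer.
    state = env.reset()
    for _ in range(n_transitions):
        action = random.choice(env.legal_actions(state))
        next_state, reward, done = env.step(action)
        replay_memory.append((state, action, reward, next_state, done))
        state = env.reset() if done else next_state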

# re-run the experiment with wins so it uses get_random_game

# consolidate the experiment into one file

Example #3
        qlearning.policy.provider = env.action_space
        if episode % 50 == 0:
            print('Episode {}'.format(episode))


# TODO: prepopulate the replay memory before training? See the DeepMind DQN paper.

period = 500
n_matches = 1000

qlearning.train(
    n_episodes=15000,
    callbacks=[
        EpisodicWLDPlotter(game=get_random_win_game,
                           opp_player=RandPlayer(random_state=seed),
                           n_matches=n_matches,
                           period=period,
                           filepath='figures/c4dn_uci_wins.pdf'),
        EpisodicWLDPlotter(game=get_random_draw_game,
                           opp_player=RandPlayer(random_state=seed),
                           n_matches=n_matches,
                           period=period,
                           filepath='figures/c4dn_uci_draws.pdf'),
        EpisodicWLDPlotter(game=get_random_loss_game,
                           opp_player=RandPlayer(random_state=seed),
                           n_matches=n_matches,
                           period=period,
                           filepath='figures/c4dn_uci_losses.pdf'),
        LinearAnnealing(egreedy,
                        'epsilon',
                        init=1.0,
Example #4
                                 selfplay=True,
                                 experience_replay=True,
                                 replay_memory_size=10000,
                                 batch_size=32)


class Monitor(Callback):
    def on_episode_begin(self, episode, qfunction):
        if episode % 50 == 0:
            print('Episode {}'.format(episode))


qlearning.train(
    n_episodes=1750,
    callbacks=[
        EpisodicWLDPlotter(game=game,
                           opp_player=RandPlayer(),
                           n_matches=1000,
                           period=250,
                           filepath='figures/c4_dqn_simple.pdf'),
        # LinearAnnealing(egreedy, 'epsilon', init=1.0, final=0.1, n_episodes=1000),
        Monitor()
    ])

from capstone.game.players import GreedyQ

g = GreedyQ(qnetwork)
print('Move:', g.choose_move(game))

# IMPORTANT: don't forget to filter for the best value among legal moves only,
# ignoring the illegal ones (a sketch follows below).
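
A minimal sketch of the "ignore the illegal moves" note above, assuming the Q-network returns one value per board position as a NumPy-compatible array and that the legal move indices are available (greedy_legal_move and the predict/legal_moves calls in the usage comment are illustrative assumptions, not the capstone API):

import numpy as np

def greedy_legal_move(qvalues, legal_moves):
    # Mask illegal moves with -inf so argmax can only select a legal one.
    masked = np.full(len(qvalues), -np.inf)
    legal_moves = np.asarray(legal_moves)
    masked[legal_moves] = np.asarray(qvalues)[legal_moves]
    return int(np.argmax(masked))

# e.g. move = greedy_legal_move(qnetwork.predict(game.state), game.legal_moves())
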
Example #5
        qlearning.env = env
        egreedy.action_space = env.actions
        qlearning.policy.provider = env.actions
        if episode % 50 == 0:
            print('Episode {}'.format(episode))


# TODO: prepopulate the replay memory before training? See the DeepMind DQN paper.

qlearning.train(
    n_episodes=5000,
    callbacks=[
        EpisodicWLDPlotter(
            # game=get_random_game,
            game=get_random_win_game,
            # game=game,
            opp_player=RandPlayer(random_state=seed),
            n_matches=1000,
            period=250,
            filepath='figures/c4_dqn_uci.pdf'),
        LinearAnnealing(egreedy,
                        'epsilon',
                        init=1.0,
                        final=0.1,
                        n_episodes=10000),
        Monitor()
    ])

# Got ~90% with 42 input units, 3 hidden layers, 7 output units, 400 hidden units, lr=0.001,
# no self-play, an experience replay size of only 1,000, 100,000 episodes, and linear
# annealing over the first 10,000 episodes only.
Example #6
from capstone.rl.utils import EpisodicWLDPlotter, Callback, LinearAnnealing
from capstone.rl.value_functions import MLP, QNetwork

# game = Connect4()
game = TicTacToe()
# mdp = GameMDP(game)
mdp = FixedGameMDP(game, RandPlayer(), 1)
env = Environment(mdp)
# qnetwork = QNetwork(n_input_units=42, n_output_units=7)
qnetwork = QNetwork(n_input_units=9,
                    n_hidden_layers=3,
                    n_output_units=9,
                    n_hidden_units=100)
# qnetwork = QNetwork(n_input_units=42, n_hidden_layers=3, n_output_units=7, n_hidden_units=100)
egreedy = EGreedy(env.actions, qnetwork, 1.0)
qlearning = ApproximateQLearning(
    env=env,
    qfunction=qnetwork,
    policy=egreedy,  # reuse the EGreedy above so the LinearAnnealing callback below actually anneals it
    discount_factor=0.99,  # change this to 1.0 and note in the report that the game is deterministic
    n_episodes=100000,
    experience_replay=False)
qlearning.train(callbacks=[
    EpisodicWLDPlotter(game=game,
                       opp_player=RandPlayer(),
                       n_matches=500,
                       period=1000,
                       filepath='figures/c4_ql_mlp_fixed.pdf'),
    LinearAnnealing(egreedy, 'epsilon', init=1.0, final=0.1, n_episodes=50000)
])

Example #7
from capstone.game.games import TicTacToe
from capstone.game.players import RandPlayer
from capstone.rl import Environment, GameMDP
from capstone.rl.learners import ApproxQLearningSelfPlay
from capstone.rl.policies import RandomPolicy
from capstone.rl.utils import EpisodicWLDPlotter
from capstone.rl.value_functions import MLP

seed = 23
game = TicTacToe()
mdp = GameMDP(game)
env = Environment(mdp)
mlp = MLP()
qlearning = ApproxQLearningSelfPlay(
    env=env,
    qfunction=mlp,  # use the MLP instance created above
    policy=RandomPolicy(env.actions, random_state=seed),
    discount_factor=0.99,
    n_episodes=100000,
    callbacks=[
        EpisodicWLDPlotter(
            game=game,
            opp_player=RandPlayer(random_state=seed),
            n_matches=100,
            period=1000,
            filepath='figures/tic_ql_mlp_selfplay_all.pdf'
        )
    ]
)
qlearning.train()
Example #8
qnetwork = QNetwork(mapping,
                    n_input_units=9,
                    n_hidden_layers=1,
                    n_output_units=9,
                    n_hidden_units=100)
egreedy = EGreedy(env.actions, qnetwork, 1.0)
qlearning = ApproximateQLearning(env=env,
                                 qfunction=qnetwork,
                                 policy=egreedy,
                                 discount_factor=1.0,
                                 experience_replay=True,
                                 batch_size=32)
qlearning.train(n_episodes=10000,
                callbacks=[
                    EpisodicWLDPlotter(game=game,
                                       opp_player=RandPlayer(),
                                       n_matches=1000,
                                       period=250,
                                       filepath='figures/tic_deep_ql.pdf'),
                    LinearAnnealing(egreedy,
                                    'epsilon',
                                    init=1.0,
                                    final=0.1,
                                    n_episodes=5000)
                ])

# n_episodes = 4,000
# n_episodes_annealing = 2,000

# mention that I tried Adam and RMSProp, but they did not work
Example #9
                                 experience_replay=True,
                                 replay_memory_size=10000,
                                 batch_size=32)


class Monitor(Callback):
    def on_episode_begin(self, episode, qfunction):
        if episode % 50 == 0:
            print('Episode {}'.format(episode))


qlearning.train(n_episodes=1200,
                callbacks=[
                    EpisodicWLDPlotter(
                        game=game,
                        opp_player=RandPlayer(random_state=seed),
                        n_matches=1000,
                        period=25,
                        filepath='figures/c4_dqn_easy_plot.pdf'),
                    LinearAnnealing(egreedy,
                                    'epsilon',
                                    init=1.0,
                                    final=0.1,
                                    n_episodes=1000),
                    Monitor()
                ])

from capstone.game.players import GreedyQ
g = GreedyQ(qnetwork)
print('Move:', g.choose_move(game))

# IMPORTANT: don't forget to filter for the best value among legal moves only,
# ignoring the illegal ones.