import gym

from d3rlpy.algos import DQN, SAC
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.online.iterators import train


def test_train_with_sac():
    env = gym.make('Pendulum-v0')
    eval_env = gym.make('Pendulum-v0')

    algo = SAC(n_epochs=1)

    buffer = ReplayBuffer(1000, env)

    train(env,
          algo,
          buffer,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)
def test_train_with_dqn():
    env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')

    algo = DQN(n_epochs=1)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)
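# A minimal sketch (not d3rlpy's implementation) of how a linearly decaying
# epsilon schedule like LinearDecayEpsilonGreedy is assumed to behave:
# epsilon is interpolated from start_epsilon down to end_epsilon over
# `duration` environment steps, then held constant. The default values
# below mirror the explicit ones used in the DQN example further down.
def linear_epsilon(step,
                   start_epsilon=1.0,
                   end_epsilon=0.1,
                   duration=10000):
    ratio = min(1.0, step / duration)
    return start_epsilon + (end_epsilon - start_epsilon) * ratio


assert linear_epsilon(0) == 1.0
assert abs(linear_epsilon(10000) - 0.1) < 1e-9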
def test_train_atari_with_dqn():
    import d4rl_atari

    env = gym.make('breakout-mixed-v0', stack=False)
    eval_env = gym.make('breakout-mixed-v0', stack=False)

    algo = DQN(n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          n_steps=100,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)

    # n_frames=4 stacks four 84x84 grayscale frames into one observation
    assert algo.impl.observation_shape == (4, 84, 84)
import gym

from d3rlpy.algos import DQN
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.online.iterators import train

env = gym.make('CartPole-v0')
eval_env = gym.make('CartPole-v0')

# setup algorithm
dqn = DQN(n_epochs=30,
          batch_size=32,
          learning_rate=2.5e-4,
          target_update_interval=100,
          use_gpu=False)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=100000, env=env)

# epsilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=10000)

# start training
train(env,
      dqn,
      buffer,
      explorer,
      eval_env=eval_env,
      n_steps_per_epoch=1000,
      n_updates_per_epoch=100)
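# After training, the greedy policy can be rolled out directly. This is a
# minimal sketch; it assumes predict() takes a batch of observations and
# returns a batch of greedy actions, and uses the classic gym step API
# matching the CartPole-v0 environment above.
observation = env.reset()
done = False
while not done:
    action = dqn.predict([observation])[0]
    observation, reward, done, _ = env.step(action)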
import gym

from d3rlpy.algos import SAC
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.iterators import train

env = gym.make('Pendulum-v0')
eval_env = gym.make('Pendulum-v0')

# setup algorithm
sac = SAC(n_epochs=100, batch_size=100, use_gpu=False)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=100000, env=env)

# start training
# probabilistic policies do not need an explorer
train(env,
      sac,
      buffer,
      eval_env=eval_env,
      n_steps_per_epoch=1000,
      n_updates_per_epoch=100)
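# SAC explores through its own stochastic policy, which is why no explorer
# is passed above. A minimal sketch of the two ways to query the policy,
# assuming sample_action() draws from the policy distribution while
# predict() returns the deterministic (greedy) action:
observation = eval_env.reset()
stochastic_action = sac.sample_action([observation])[0]  # exploration
greedy_action = sac.predict([observation])[0]            # evaluation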
from sklearn.model_selection import train_test_split

from d3rlpy.algos import AWAC
from d3rlpy.datasets import get_pybullet
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.iterators import train

# prepare dataset and environment
dataset, env = get_pybullet('hopper-bullet-random-v0')
_, eval_env = get_pybullet('hopper-bullet-random-v0')

train_episodes, test_episodes = train_test_split(dataset)

# setup algorithm
awac = AWAC(n_epochs=30,
            encoder_params={'hidden_units': [256, 256, 256, 256]},
            use_gpu=True)

# pretrain with the offline dataset
awac.fit(train_episodes[:10000],
         eval_episodes=test_episodes,
         scorers={
             'environment': evaluate_on_environment(env),
             'advantage': discounted_sum_of_advantage_scorer,
             'value_scale': average_value_estimation_scorer
         })

# fine-tuning with online interaction
awac.n_epochs = 1000

# seed the replay buffer with the offline episodes
buffer = ReplayBuffer(1000000, env, train_episodes[:10000])

train(env,
      awac,
      buffer,
      eval_env=eval_env,
      eval_epsilon=0.0,
      n_steps_per_epoch=1000,
      n_updates_per_epoch=1000)
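# Once fine-tuning finishes, the learned policy can be persisted. This is a
# minimal sketch assuming d3rlpy's save_model/save_policy interfaces; the
# file names are arbitrary examples.
awac.save_model('awac_finetuned.pt')  # full algorithm state for resuming
awac.save_policy('awac_policy.pt')    # greedy policy only, as TorchScript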