Example 1
import gym

from d3rlpy.algos import SAC
from d3rlpy.online.buffers import ReplayBuffer


def test_fit_online_pendulum_with_sac():
    env = gym.make('Pendulum-v0')
    eval_env = gym.make('Pendulum-v0')

    algo = SAC()

    buffer = ReplayBuffer(1000, env)

    algo.fit_online(env,
                    buffer,
                    n_epochs=1,
                    eval_env=eval_env,
                    logdir='test_data',
                    tensorboard=False)
Example 2
import gym

from d3rlpy.algos import SAC
from d3rlpy.online.buffers import ReplayBuffer


def test_fit_online_pendulum_with_sac():
    env = gym.make("Pendulum-v0")
    eval_env = gym.make("Pendulum-v0")

    algo = SAC()

    buffer = ReplayBuffer(1000, env)

    algo.fit_online(
        env,
        buffer,
        n_steps=500,
        eval_env=eval_env,
        logdir="test_data",
    )
Example 3
import gym
import pytest

from d3rlpy.algos import SAC
from d3rlpy.online.buffers import ReplayBuffer


@pytest.mark.parametrize("timelimit_aware", [True, False])
def test_timelimit_aware(timelimit_aware):
    env = gym.make("Pendulum-v0")

    algo = SAC()

    buffer = ReplayBuffer(1000, env)

    algo.fit_online(
        env,
        buffer,
        n_steps=500,
        logdir="test_data",
        timelimit_aware=timelimit_aware,
    )

    # Pendulum-v0 episodes end only via gym's TimeLimit wrapper, so with
    # timelimit_aware enabled no transition should be flagged as terminal.
    terminal_count = 0
    for i in range(len(buffer)):
        terminal_count += int(buffer.transitions[i].terminal)

    if timelimit_aware:
        assert terminal_count == 0
    else:
        assert terminal_count > 0
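For context, the behaviour this test exercises can be sketched as follows. This is not d3rlpy's actual implementation; it only assumes gym's standard TimeLimit wrapper, which sets info["TimeLimit.truncated"] = True when an episode is cut off by the step limit rather than by a true terminal state.

# Hypothetical sketch of timelimit-aware termination handling.
def resolve_terminal(done, info, timelimit_aware):
    # a time-limit truncation is not a real terminal state, so a
    # timelimit-aware collector stores the transition with terminal=0
    if done and timelimit_aware and info.get("TimeLimit.truncated", False):
        return 0.0
    return float(done)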
Example 4
import gym

from d3rlpy.algos import SAC
from d3rlpy.online.buffers import ReplayBuffer

env = gym.make('Pendulum-v0')
eval_env = gym.make('Pendulum-v0')

# setup algorithm
sac = SAC(batch_size=100, use_gpu=False)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=100000, env=env)

# start training
# probabilistic policies do not need explorers
sac.fit_online(env,
               buffer,
               n_epochs=100,
               eval_env=eval_env,
               n_steps_per_epoch=1000,
               n_updates_per_epoch=100)
Example 5
import gym

from d3rlpy.algos import SAC
from d3rlpy.online.buffers import ReplayBuffer

env = gym.make('Pendulum-v0')
eval_env = gym.make('Pendulum-v0')

# setup algorithm
sac = SAC(batch_size=100, use_gpu=False)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=100000, env=env)

# start training
# probabilistic policies do not need explorers
sac.fit_online(env,
               buffer,
               n_steps=100000,
               eval_env=eval_env,
               n_steps_per_epoch=1000,
               update_start_step=1000)
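The comment in Examples 4 and 5 notes that probabilistic policies such as SAC's do not need explorers. For contrast, here is a minimal sketch of online training with an explorer for a greedy-policy algorithm, assuming the same d3rlpy version as the examples above; the DQN, LinearDecayEpsilonGreedy, and CartPole-v0 choices are illustrative and not taken from the examples.

import gym

from d3rlpy.algos import DQN
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy

env = gym.make('CartPole-v0')
eval_env = gym.make('CartPole-v0')

# setup algorithm with a deterministic (greedy) policy
dqn = DQN(batch_size=32, use_gpu=False)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=100000, env=env)

# epsilon-greedy exploration schedule
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=10000)

# start training
# deterministic policies rely on the explorer for exploration
dqn.fit_online(env,
               buffer,
               explorer=explorer,
               n_steps=30000,
               eval_env=eval_env,
               n_steps_per_epoch=1000)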