def test_replay_buffer_with_episode(maxlen, data_size):
    env = gym.make("CartPole-v0")

    observation_shape = env.observation_space.shape
    action_size = env.action_space.n
    observations = np.random.random((data_size, *observation_shape))
    actions = np.random.randint(action_size, size=data_size, dtype=np.int32)
    rewards = np.random.random(data_size)

    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations.astype("f4"),
        actions=actions,
        rewards=rewards.astype("f4"),
    )

    buffer = ReplayBuffer(maxlen, env, episodes=[episode])

    # check episode initialization
    assert len(buffer) == data_size - 1

    # check append_episode
    buffer.append_episode(episode)
    assert len(buffer) == 2 * (data_size - 1)
def train(params):
    # setup algorithm
    dqn = DQN(batch_size=params.get("batch_size"),
              learning_rate=params.get("learning_rate"),
              target_update_interval=params.get("target_update_interval"),
              q_func_factory=QRQFunctionFactory(
                  n_quantiles=params.get("n_quantiles")),
              n_steps=params.get("train_freq"),
              gamma=params.get("gamma"),
              n_critics=1,
              target_reduction_type="min",
              use_gpu=True)

    # setup replay buffer
    buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

    # setup explorer
    explorer = LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=params.get("exploration_final_eps"),
        duration=100000)

    # start training
    dqn.fit_online(
        env,
        buffer,
        n_steps=params.get("train_steps"),
        explorer=explorer,  # you don't need this with probabilistic policy algorithms
        tensorboard_dir=log_dir,
        eval_env=eval_env)

    dqn.save_model(exp_name)
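A minimal usage sketch for the train() function above, assuming env, eval_env, log_dir and exp_name are defined at module level as the function expects; the dictionary keys mirror the params.get(...) lookups inside train(), while the specific values are illustrative assumptions rather than values from the original script.

# Hypothetical invocation of train(); every value below is an illustrative assumption.
params = {
    "batch_size": 32,
    "learning_rate": 6.25e-5,
    "target_update_interval": 8000,
    "n_quantiles": 200,
    "train_freq": 1,
    "gamma": 0.99,
    "buffer_size": 100000,
    "exploration_final_eps": 0.01,
    "train_steps": 1000000,
}
train(params)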
def test_replay_buffer(n_episodes, batch_size, maxlen, gamma):
    env = gym.make('CartPole-v0')

    buffer = ReplayBuffer(maxlen, env, gamma)

    total_step = 0
    for episode in range(n_episodes):
        observation, reward, terminal = env.reset(), 0.0, False
        while not terminal:
            action = env.action_space.sample()
            buffer.append(observation, action, reward, terminal)
            observation, reward, terminal, _ = env.step(action)
            total_step += 1
        buffer.append(observation, action, reward, terminal)
        total_step += 1

    assert len(buffer) == maxlen

    observation_shape = env.observation_space.shape

    batch = buffer.sample(batch_size)
    assert len(batch) == batch_size
    assert batch.observations.shape == (batch_size,) + observation_shape
    assert batch.actions.shape == (batch_size, 1)
    assert batch.rewards.shape == (batch_size, 1)
    assert batch.next_observations.shape == (batch_size,) + observation_shape
    assert batch.next_actions.shape == (batch_size, 1)
    assert batch.next_rewards.shape == (batch_size, 1)
    assert batch.terminals.shape == (batch_size, 1)
    assert len(batch.returns) == batch_size
    assert len(batch.consequent_observations) == batch_size
def test_replay_buffer(n_episodes, batch_size, maxlen):
    env = gym.make("CartPole-v0")

    buffer = ReplayBuffer(maxlen, env)

    total_step = 0
    for episode in range(n_episodes):
        observation, reward, terminal = env.reset(), 0.0, False
        while not terminal:
            action = env.action_space.sample()
            buffer.append(observation.astype("f4"), action, reward, terminal)
            observation, reward, terminal, _ = env.step(action)
            total_step += 1
        buffer.append(observation.astype("f4"), action, reward, terminal)
        total_step += 1

    assert len(buffer) == maxlen

    observation_shape = env.observation_space.shape

    batch = buffer.sample(batch_size)
    assert len(batch) == batch_size
    assert batch.observations.shape == (batch_size,) + observation_shape
    assert batch.actions.shape == (batch_size,)
    assert batch.rewards.shape == (batch_size, 1)
    assert batch.next_observations.shape == (batch_size,) + observation_shape
    assert batch.next_actions.shape == (batch_size,)
    assert batch.next_rewards.shape == (batch_size, 1)
    assert batch.terminals.shape == (batch_size, 1)
    assert isinstance(batch.observations, np.ndarray)
    assert isinstance(batch.next_observations, np.ndarray)
def test_replay_buffer_with_clip_episode(n_episodes, batch_size, maxlen,
                                         clip_episode_flag):
    env = gym.make("CartPole-v0")

    buffer = ReplayBuffer(maxlen, env)

    observation, reward, terminal = env.reset(), 0.0, False
    clip_episode = False
    while not clip_episode:
        action = env.action_space.sample()
        observation, reward, terminal, _ = env.step(action)
        clip_episode = terminal
        if clip_episode_flag and terminal:
            terminal = False
        buffer.append(
            observation=observation.astype("f4"),
            action=action,
            reward=reward,
            terminal=terminal,
            clip_episode=clip_episode,
        )

    # make a transition for a new episode
    for _ in range(2):
        buffer.append(
            observation=observation.astype("f4"),
            action=action,
            reward=reward,
            terminal=False,
        )

    assert buffer.transitions[-2].terminal != clip_episode_flag
    assert buffer.transitions[-2].next_transition is None
    assert buffer.transitions[-1].prev_transition is None
def test_train_with_sac():
    env = gym.make('Pendulum-v0')
    eval_env = gym.make('Pendulum-v0')

    algo = SAC(n_epochs=1)

    buffer = ReplayBuffer(1000, env)

    train(env,
          algo,
          buffer,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)
def test_fit_online_pendulum_with_sac():
    env = gym.make('Pendulum-v0')
    eval_env = gym.make('Pendulum-v0')

    algo = SAC()

    buffer = ReplayBuffer(1000, env)

    algo.fit_online(env,
                    buffer,
                    n_epochs=1,
                    eval_env=eval_env,
                    logdir='test_data',
                    tensorboard=False)
def test_fit_online_pendulum_with_sac():
    env = gym.make("Pendulum-v0")
    eval_env = gym.make("Pendulum-v0")

    algo = SAC()

    buffer = ReplayBuffer(1000, env)

    algo.fit_online(
        env,
        buffer,
        n_steps=500,
        eval_env=eval_env,
        logdir="test_data",
    )
def test_replay_buffer(n_episodes, batch_size, maxlen, create_mask, mask_size):
    env = gym.make("CartPole-v0")

    buffer = ReplayBuffer(maxlen, env, create_mask=create_mask,
                          mask_size=mask_size)

    total_step = 0
    for episode in range(n_episodes):
        observation, reward, terminal = env.reset(), 0.0, False
        while not terminal:
            action = env.action_space.sample()
            buffer.append(observation.astype("f4"), action, reward, terminal)
            observation, reward, terminal, _ = env.step(action)
            total_step += 1
        buffer.append(observation.astype("f4"), action, reward, terminal)
        total_step += 1

    assert len(buffer) == maxlen

    # check static dataset conversion
    dataset = buffer.to_mdp_dataset()
    transitions = []
    for episode in dataset:
        transitions += episode.transitions
    assert len(transitions) >= len(buffer)

    observation_shape = env.observation_space.shape

    batch = buffer.sample(batch_size)
    assert len(batch) == batch_size
    assert batch.observations.shape == (batch_size,) + observation_shape
    assert batch.actions.shape == (batch_size,)
    assert batch.rewards.shape == (batch_size, 1)
    assert batch.next_observations.shape == (batch_size,) + observation_shape
    assert batch.next_actions.shape == (batch_size,)
    assert batch.next_rewards.shape == (batch_size, 1)
    assert batch.terminals.shape == (batch_size, 1)
    assert isinstance(batch.observations, np.ndarray)
    assert isinstance(batch.next_observations, np.ndarray)

    if create_mask:
        assert batch.masks.shape == (mask_size, batch_size, 1)
    else:
        assert batch.masks is None
def test_train_with_dqn():
    env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')

    algo = DQN(n_epochs=1)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)
def test_fit_online_cartpole_with_dqn():
    env = gym.make('CartPole-v0')
    eval_env = gym.make('CartPole-v0')

    algo = DQN()

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(env,
                    buffer,
                    explorer,
                    n_epochs=1,
                    eval_env=eval_env,
                    logdir='test_data',
                    tensorboard=False)
def test_fit_online_cartpole_with_dqn():
    env = gym.make("CartPole-v0")
    eval_env = gym.make("CartPole-v0")

    algo = DQN()

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
    )
def test_train_atari_with_dqn():
    import d4rl_atari

    env = gym.make('breakout-mixed-v0', stack=False)
    eval_env = gym.make('breakout-mixed-v0', stack=False)

    algo = DQN(n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    train(env,
          algo,
          buffer,
          explorer,
          n_steps=100,
          eval_env=eval_env,
          logdir='test_data',
          tensorboard=False)

    assert algo.impl.observation_shape == (4, 84, 84)
def test_fit_online_atari_with_dqn():
    import d4rl_atari

    env = ChannelFirst(DummyAtari())
    eval_env = ChannelFirst(DummyAtari())

    algo = DQN(n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
    )

    assert algo.impl.observation_shape == (4, 84, 84)
def test_fit_online_atari_with_dqn():
    import d4rl_atari

    env = ChannelFirst(gym.make("breakout-mixed-v0"))
    eval_env = ChannelFirst(gym.make("breakout-mixed-v0"))

    algo = DQN(n_frames=4)

    buffer = ReplayBuffer(1000, env)

    explorer = LinearDecayEpsilonGreedy()

    algo.fit_online(
        env,
        buffer,
        explorer,
        n_steps=100,
        eval_env=eval_env,
        logdir="test_data",
        tensorboard=False,
    )

    assert algo.impl.observation_shape == (4, 84, 84)
def test_timelimit_aware(timelimit_aware):
    env = gym.make("Pendulum-v0")

    algo = SAC()

    buffer = ReplayBuffer(1000, env)

    algo.fit_online(
        env,
        buffer,
        n_steps=500,
        logdir="test_data",
        timelimit_aware=timelimit_aware,
    )

    # with timelimit_aware=True, episodes cut off by gym's TimeLimit wrapper are
    # clipped without being stored as terminal transitions
    terminal_count = 0
    for i in range(len(buffer)):
        terminal_count += int(buffer.transitions[i].terminal)

    if timelimit_aware:
        assert terminal_count == 0
    else:
        assert terminal_count > 0
                   stack=False,
                   clip_reward=False,
                   terminate_on_life_loss=False)

# setup algorithm
dqn = DoubleDQN(batch_size=32,
                learning_rate=2.5e-4,
                optim_factory=AdamFactory(eps=1e-2 / 32),
                target_update_interval=10000,
                q_func_factory='mean',
                scaler='pixel',
                n_frames=4,
                use_gpu=True)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=1000000, env=env)

# epsilon-greedy explorer
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=1000000)

# start training
dqn.fit_online(env,
               buffer,
               explorer,
               eval_env=eval_env,
               eval_epsilon=0.01,
               n_steps=50000000,
               n_steps_per_epoch=100000,
               update_interval=4,
def train(params):
    if pretrain:
        # setup algorithm
        dqn = DQN(batch_size=params.get("batch_size"),
                  learning_rate=params.get("learning_rate"),
                  target_update_interval=params.get("target_update_interval"),
                  q_func_factory=QRQFunctionFactory(
                      n_quantiles=params.get("n_quantiles")),
                  n_steps=params.get("train_freq"),
                  gamma=params.get("gamma"),
                  n_critics=1,
                  target_reduction_type="min",
                  use_gpu=True)

        # setup replay buffer
        buffer = ReplayBuffer(maxlen=params.get("buffer_size"), env=env)

        # setup explorer
        explorer = LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=params.get("exploration_final_eps"),
            duration=100000)

        # start training
        dqn.fit_online(
            env,
            buffer,
            n_steps=params.get("train_steps"),
            explorer=explorer,  # you don't need this with probabilistic policy algorithms
            tensorboard_dir=log_dir,
            eval_env=eval_env)

        print("Saving Model")
        dqn.save_model(exp_name)

        print("convert buffer to dataset")
        dataset = buffer.to_mdp_dataset()

        # save MDPDataset
        dataset.dump('{0}.h5'.format(exp_name))

    print("Loading Dataset for Offline Training")
    dataset = d3rlpy.dataset.MDPDataset.load('{0}.h5'.format(exp_name))
    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # The dataset can then be used to train a d3rlpy model
    cql = DiscreteCQL(learning_rate=6.25e-05,
                      encoder_factory='default',
                      q_func_factory='mean',
                      batch_size=32,
                      n_frames=1,
                      n_steps=1,
                      gamma=0.99,
                      n_critics=1,
                      bootstrap=False,
                      share_encoder=False,
                      target_reduction_type='min',
                      target_update_interval=8000,
                      use_gpu=True,
                      scaler=None,
                      augmentation=None,
                      generator=None,
                      impl=None)

    cql_exp = params.get("model_name") + "_offline_" + params.get("environment")
    cql_log = '../../../logs/' + cql_exp

    cql.fit(dataset.episodes,
            eval_episodes=test_episodes,
            n_epochs=1000,
            scorers={
                'environment': evaluate_on_environment(env, epsilon=0.05),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
            },
            tensorboard_dir=cql_log)

    cql.save_model(cql_exp)
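After cql.save_model(cql_exp) has run, the saved weights can be restored into a fresh algorithm object for evaluation. A minimal sketch, assuming a d3rlpy version that provides build_with_env() and load_model() and that env and cql_exp are still in scope; the greedy rollout step at the end is purely illustrative.

# Minimal reload sketch (assumes env and cql_exp from train() are available).
new_cql = DiscreteCQL(use_gpu=True)
new_cql.build_with_env(env)   # build networks so the saved weights can be loaded
new_cql.load_model(cql_exp)   # file written by cql.save_model(cql_exp) above

# greedy action for a single observation (illustrative only)
observation = env.reset()
action = new_cql.predict([observation])[0]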
from sklearn.model_selection import train_test_split

from d3rlpy.algos import AWAC
from d3rlpy.datasets import get_pybullet
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer

# prepare dataset and environment
dataset, env = get_pybullet('hopper-bullet-random-v0')
_, eval_env = get_pybullet('hopper-bullet-random-v0')

train_episodes, test_episodes = train_test_split(dataset)

# setup algorithm
awac = AWAC(encoder_params={'hidden_units': [256, 256, 256, 256]},
            use_gpu=True)

# pretrain
awac.fit(train_episodes[:10000],
         eval_episodes=test_episodes,
         n_epochs=30,
         scorers={
             'environment': evaluate_on_environment(env),
             'advantage': discounted_sum_of_advantage_scorer,
             'value_scale': average_value_estimation_scorer
         })

# fine-tuning
awac.fit_online(env,
                ReplayBuffer(1000000, env, train_episodes[:10000]),
                n_epochs=1000,
                eval_env=eval_env,
                eval_epsilon=0.0,
                n_steps_per_epoch=1000,
                n_updates_per_epoch=1000)
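If the fine-tuned policy needs to run without d3rlpy, it can be exported with save_policy(); a minimal sketch, assuming a d3rlpy version that supports TorchScript export and with the output file name chosen here as an assumption.

# Export the greedy policy as TorchScript for deployment (file name is an assumption).
awac.save_policy('awac_finetuned_policy.pt')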