# (tail of the per-timestep loop in the DDPG training script)
global_timestep.assign_add(1)
episode_len += 1
total_reward += reward
state = next_state

# for evaluation purposes
if global_timestep.numpy() % agent.params.eval_interval == 0:
    agent.eval_flg = True

"""
===== After 1 Episode is Done =====
"""
# train the model at this point
for t_train in range(episode_len):  # in MuJoCo, this will be 1,000 iterations!
    states, actions, rewards, next_states, dones = replay_buffer.sample(agent.params.batch_size)
    loss = agent.update(states, actions, rewards, next_states, dones)
    soft_target_model_update_eager(agent.target_actor, agent.actor, tau=agent.params.soft_update_tau)
    soft_target_model_update_eager(agent.target_critic, agent.critic, tau=agent.params.soft_update_tau)

tf.contrib.summary.scalar("reward", total_reward, step=i)
tf.contrib.summary.scalar("exec time", time.time() - start, step=i)
if i >= agent.params.reward_buffer_ep:
    tf.contrib.summary.scalar("Moving Ave Reward", np.mean(reward_buffer), step=i)

# store the episode reward and wall-clock time
reward_buffer.append(total_reward)
time_buffer.append(time.time() - start)

if global_timestep.numpy() > agent.params.learning_start and i % agent.params.reward_buffer_ep == 0:
    log.logging(global_timestep.numpy(), i, np.sum(time_buffer), reward_buffer, np.mean(loss), 0, [0])
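# A minimal sketch of what `soft_target_model_update_eager` is assumed to do:
# Polyak-average the online network's weights into the target network,
#     theta_target <- tau * theta_online + (1 - tau) * theta_target
# The real helper lives in tf_rl.common and may differ in detail.
def soft_target_model_update_eager(target_model, online_model, tau=0.005):
    for target_var, online_var in zip(target_model.trainable_variables,
                                      online_model.trainable_variables):
        # blend the two weight sets in place on the target variable
        target_var.assign(tau * online_var + (1.0 - tau) * target_var)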
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

size = 1000
env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(size, n_step=5, flg_seq=True)
print("Memory contains {0} timesteps".format(len(memory)))

# collect a single transition, then reuse it to fill the buffer
state = env.reset()
action = env.action_space.sample()
next_state, reward, done, info = env.step(action)
env.close()

for _ in range(size):
    memory.add(state, action, reward, next_state, done)
print(len(memory))
memory.save()
print("Memory contains {0} timesteps".format(len(memory)))

# a sampled batch has a leading batch dimension, unlike a single state
states, actions, rewards, next_states, dones = memory.sample(batch_size=10)
print(states.shape, state.shape)

# repeat sampling to exercise the sampling path
for _ in range(size):
    memory.sample(batch_size=10)
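# A minimal sketch of the ring-buffer mechanics behind a ReplayBuffer-style
# add/sample API. tf_rl's actual implementation (including the n_step and
# flg_seq handling used above) will differ; this only illustrates the idea.
import random

class MiniReplayBuffer:
    def __init__(self, size):
        self._storage, self._size, self._next = [], size, 0

    def __len__(self):
        return len(self._storage)

    def add(self, state, action, reward, next_state, done):
        data = (state, action, reward, next_state, done)
        if self._next >= len(self._storage):
            self._storage.append(data)        # still filling up
        else:
            self._storage[self._next] = data  # overwrite the oldest entry
        self._next = (self._next + 1) % self._size

    def sample(self, batch_size):
        # uniform random minibatch, transposed into per-field lists
        batch = random.sample(self._storage, batch_size)
        return map(list, zip(*batch))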
    # (tail of the `ask_expert` helper: turn the expert's logits into discrete actions)
    expert_action = tf.argmax(expert_action, axis=-1)
    return expert_action


for epoch in range(300):
    state = env.reset()
    done = False
    reward_ep = 0
    while not done:
        # warm-up: act randomly for the first two epochs, then follow the agent
        if epoch <= 1:
            action = env.action_space.sample()
        else:
            action = agent.select_action(state=state)
            action = np.squeeze(action).astype(np.int8)
        next_state, reward, done, info = env.step(action)
        buffer.add(state, action, reward, next_state, done)
        state = next_state
        reward_ep += reward

    reward_total.append(reward_ep)

    # after each episode, fit the agent to the expert's actions on sampled states
    losses = list()
    for grad_step in range(10):
        states, _, _, _, _ = buffer.sample(batch_size=32)
        expert_action = ask_expert(states)
        loss = agent.update(states, expert_action)
        losses.append(loss.numpy())

    print("Ep: {} Reward: {} MAR: {:.4f} Loss: {:.4f}".format(
        epoch, reward_ep, np.mean(reward_total), np.mean(losses)))

env.close()
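# Hedged sketch of the full `ask_expert` helper whose tail appears above.
# Assumption: `expert_model` is a pretrained policy network (not shown in this
# snippet) that maps a batch of states to per-action logits; argmax then picks
# the expert's discrete action for each state.
def ask_expert(states):
    logits = expert_model(tf.convert_to_tensor(states, dtype=tf.float32))
    expert_action = tf.argmax(logits, axis=-1)
    return expert_action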
    # (tail of the data-collection loop)
    memory_tf.add(state, action, reward, next_state, done)
    state = next_state
env.close()

print("=== test ===")
"""
Note: I have run a performance test in which we repeatedly sample from the
Replay Buffer 1,000 times and measure the execution time, comparing plain
Eager execution against Eager with tf.function.

Result:
    without tf.function: 9.03s
    with tf.function:    1.13s
"""
import time

begin = time.time()
for _ in range(1000):
    memory_tf.sample_tf(batch_size=10)
print("with tf.function took : {:.3f}s".format(time.time() - begin))

begin = time.time()
for _ in range(1000):
    memory_tf.sample(batch_size=10)
print("w/o tf.function took  : {:.3f}s".format(time.time() - begin))

begin = time.time()
for _ in range(1000):
    memory.sample(batch_size=10)
print("original memory took  : {:.3f}s".format(time.time() - begin))
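# Hedged sketch of the technique behind `sample_tf`: wrapping the sampling
# logic in tf.function traces it into a graph once, so the 1,000 repeated
# calls above skip per-call Python overhead. The class, names, and internals
# below are illustrative assumptions, not tf_rl's actual buffer implementation.
import tensorflow as tf

class GraphSampler:
    def __init__(self, states, actions):
        # hold the stored transitions as constant tensors
        self.states = tf.constant(states)
        self.actions = tf.constant(actions)

    @tf.function
    def sample_tf(self, batch_size=10):
        # draw random indices and gather the matching transitions in-graph
        idx = tf.random.uniform([batch_size], maxval=tf.shape(self.states)[0],
                                dtype=tf.int32)
        return tf.gather(self.states, idx), tf.gather(self.actions, idx)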
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(1000, n_step=5, flg_seq=True)
print("Memory contains {0} timesteps".format(len(memory)))

for i in range(1):
    state = env.reset()
    for t in range(1000):
        # env.render()
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        memory.add(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            print("Memory contains {0} timesteps".format(len(memory)))
            break

env.close()
print("Memory contains {0} timesteps".format(len(memory)))

state, action, reward, next_state, done = memory.sample(batch_size=10)
print(state.shape, action.shape)
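# Hedged sketch of the idea behind the n_step=5 flag used above: an n-step
# buffer typically folds the next n rewards into one discounted return,
#     R_t = r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1}
# How tf_rl's ReplayBuffer does this internally is an assumption here, and
# `gamma` below is an illustrative discount factor, not taken from the library.
import numpy as np

def n_step_return(rewards, gamma=0.99, n_step=5):
    # discounted sum over the first n_step rewards
    rewards = np.asarray(rewards[:n_step], dtype=np.float32)
    discounts = gamma ** np.arange(len(rewards))
    return float(np.sum(discounts * rewards))

print(n_step_return([1.0, 1.0, 1.0, 1.0, 1.0]))  # 1 + 0.99 + ... + 0.99^4 ~= 4.901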