                global_timestep.assign_add(1)
                episode_len += 1
                total_reward += reward
                state = next_state

                # periodically flag the agent for evaluation
                if global_timestep.numpy() % agent.params.eval_interval == 0:
                    agent.eval_flg = True

            """
            ===== After 1 Episode is Done =====
            """

            # train the model at this point
            for t_train in range(episode_len):  # in mujoco, this will be 1,000 iterations!
                states, actions, rewards, next_states, dones = replay_buffer.sample(agent.params.batch_size)
                loss = agent.update(states, actions, rewards, next_states, dones)
                soft_target_model_update_eager(agent.target_actor, agent.actor, tau=agent.params.soft_update_tau)
                soft_target_model_update_eager(agent.target_critic, agent.critic, tau=agent.params.soft_update_tau)

            tf.contrib.summary.scalar("reward", total_reward, step=i)
            tf.contrib.summary.scalar("exec time", time.time() - start, step=i)
            if i >= agent.params.reward_buffer_ep:
                tf.contrib.summary.scalar("Moving Ave Reward", np.mean(reward_buffer), step=i)

            # store the episode reward
            reward_buffer.append(total_reward)
            time_buffer.append(time.time() - start)

            if global_timestep.numpy() > agent.params.learning_start and i % agent.params.reward_buffer_ep == 0:
                log.logging(global_timestep.numpy(), i, np.sum(time_buffer), reward_buffer, np.mean(loss), 0, [0])
Example #2
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

size = 1000

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(size, n_step=5, flg_seq=True)

print("Memory contains {0} timesteps".format(len(memory)))

state = env.reset()
action = env.action_space.sample()
next_state, reward, done, info = env.step(action)
env.close()

# fill the buffer by repeatedly adding the single transition collected above
for _ in range(size):
    memory.add(state, action, reward, next_state, done)
print(len(memory))
memory.save()

print("Memory contains {0} timesteps".format(len(memory)))
states, actions, rewards, next_states, dones = memory.sample(batch_size=10)
print(states.shape, state.shape)

for _ in range(size):
    memory.sample(batch_size=10)
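# Note on n_step=5 above: an n-step buffer typically folds the next n rewards into a
# single discounted return before a transition is sampled. A minimal sketch of that
# accumulation under the usual definition (gamma, rewards and dones are illustrative
# inputs; tf_rl's exact n-step/flg_seq semantics may differ):
def n_step_return_sketch(rewards, dones, gamma=0.99):
    # rewards/dones cover the n steps following a transition, oldest first
    ret = 0.0
    for k in reversed(range(len(rewards))):
        ret = rewards[k] + gamma * ret * (1.0 - float(dones[k]))
    return ret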
Example #3
        expert_action = tf.argmax(expert_action, axis=-1)
        return expert_action

    for epoch in range(300):
        state = env.reset()
        done = False
        reward_ep = 0
        while not done:
            if epoch <= 1:
                action = env.action_space.sample()
            else:
                action = agent.select_action(state=state)
                action = np.squeeze(action).astype(np.int8)
            next_state, reward, done, info = env.step(action)
            buffer.add(state, action, reward, next_state, done)
            state = next_state
            reward_ep += reward
        reward_total.append(reward_ep)

        losses = list()
        for grad_step in range(10):
            states, _, _, _, _ = buffer.sample(batch_size=32)
            expert_action = ask_expert(states)
            loss = agent.update(states, expert_action)
            losses.append(loss.numpy())

        print("Ep: {} Reward: {} MAR: {:.4f} Loss: {:.4f}".format(
            epoch, reward_ep, np.mean(reward_total), np.mean(losses)))

    env.close()
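# The agent.update(states, expert_action) call above is a behavioural-cloning step:
# the policy is trained to reproduce the expert's actions on the visited states
# (DAgger-style imitation). A minimal sketch of such an update, assuming a Keras
# policy network that outputs action logits and a tf.keras optimizer (the names
# policy, optimizer and bc_update_sketch are illustrative, not tf_rl's API):
import tensorflow as tf

def bc_update_sketch(policy, optimizer, states, expert_actions):
    with tf.GradientTape() as tape:
        logits = policy(states)
        # cross-entropy between the expert's integer actions and the policy's logits
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=expert_actions,
                                                           logits=logits))
    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))
    return loss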
Example #4
    memory_tf.add(state, action, reward, next_state, done)
    state = next_state
env.close()

print("=== test ===")
"""
Note:
    I ran a performance test that repeats sampling from the Replay Buffer 1,000 times
    and measured the execution time to compare plain Eager execution against Eager with tf.function.

Result:
    without tf.function: 9.03s
    with tf.function:    1.13s
"""
import time

begin = time.time()
for _ in range(1000):
    memory_tf.sample_tf(batch_size=10)
print("with tf.function took : {:3f}s".format(time.time() - begin))

begin = time.time()
for _ in range(1000):
    memory_tf.sample(batch_size=10)
print("w/o tf. function took : {:3f}s".format(time.time() - begin))

begin = time.time()
for _ in range(1000):
    memory.sample(batch_size=10)
print("original memory took : {:3f}s".format(time.time() - begin))
Example #5
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(1000, n_step=5, flg_seq=True)

print("Memory contains {0} timesteps".format(len(memory)))

for i in range(1):
    state = env.reset()
    for t in range(1000):
        # env.render()
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        memory.add(state, action, reward, next_state, done)
        state = next_state

        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            print("Memory contains {0} timesteps".format(len(memory)))
            break

env.close()

print("Memory contains {0} timesteps".format(len(memory)))
state, action, reward, next_state, done = memory.sample(batch_size=10)
print(state.shape, action.shape)