Example #1
def train_eval(log_dir="DDPG",
               prev_log="",
               google_colab=False,
               seed=123,
               gpu_id=0,
               env_name="HalfCheetah-v2",
               num_frames=10000,
               tau=1e-2,
               memory_size=5000,
               hot_start=100,
               batch_size=200,
               interval_MAR=10,
               gamma=0.99,
               L2_reg=0.5,
               random_process="ou",
               mu=0.3,
               sigma=0.2,
               num_eval_episodes=1,
               eval_interval=1000):
    tf.compat.v1.set_random_seed(seed)
    np.random.seed(seed=seed)

    # prep for training
    log_dir = set_up_for_training(env_name=env_name,
                                  seed=seed,
                                  gpu_id=gpu_id,
                                  log_dir=log_dir,
                                  prev_log=prev_log,
                                  google_colab=google_colab)

    env = gym.make(env_name)
    env = Monitor(env=env, directory=log_dir["video_path"], force=True)

    replay_buffer = ReplayBuffer(memory_size, traj_dir=log_dir["traj_path"])
    reward_buffer = deque(maxlen=interval_MAR)
    summary_writer = tf.compat.v2.summary.create_file_writer(
        log_dir["summary_path"])

    if random_process == "ou":
        random_process = OrnsteinUhlenbeckProcess(
            size=env.action_space.shape[0], theta=0.15, mu=mu, sigma=sigma)
    elif random_process == "gaussian":
        random_process = GaussianNoise(mu=mu, sigma=sigma)
    else:
        raise ValueError("random_process must be either 'ou' or 'gaussian'")

    agent = DDPG(actor=Actor,
                 critic=Critic,
                 num_action=env.action_space.shape[0],
                 random_process=random_process,
                 gamma=gamma,
                 L2_reg=L2_reg,
                 actor_model_dir=log_dir["model_path"] + "/actor",
                 critic_model_dir=log_dir["model_path"] + "/critic")

    train(agent, env, replay_buffer, reward_buffer, summary_writer,
          num_eval_episodes, num_frames, tau, eval_interval, hot_start,
          batch_size, interval_MAR, log_dir, google_colab)
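
# A minimal, hypothetical invocation of the train_eval above. The keyword names
# come from its signature; actually running it still requires the rest of the
# original script's imports (tf, gym, Monitor, deque, DDPG, Actor, Critic,
# ReplayBuffer, ...).
if __name__ == "__main__":
    train_eval(env_name="HalfCheetah-v2",
               random_process="ou",
               num_frames=10000,
               seed=123)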
Example #2
def train_eval(log_dir_name,
               random_seed,
               env_name="CartPole",
               eps_start=1.0,
               eps_end=0.02,
               decay_steps=3000,
               optimizer=tf.keras.optimizers.RMSprop,
               learning_rate=0.00025,
               decay=0.95,
               momentum=0.0,
               epsilon=0.00001,
               centered=True,
               loss_fn=tf.compat.v1.losses.huber_loss,
               grad_clip_flg=None,
               num_frames=10000,
               train_freq=1,
               memory_size=5000,
               hot_start=100,
               sync_freq=1000,
               batch_size=32,
               interval_MAR=10,
               gamma=0.99,
               num_eval_episodes=1,
               eval_interval=1000):
    # init global time-step
    global_timestep = tf.compat.v1.train.create_global_step()

    # instantiate annealing funcs for ep and lr
    anneal_ep = tf.compat.v1.train.polynomial_decay(eps_start, global_timestep,
                                                    decay_steps, eps_end)

    # prep for training
    log_dir = set_up_for_training(log_dir_name=log_dir_name,
                                  env_name=env_name,
                                  seed=random_seed)
    env = prep_env(env_name=env_name, video_path=log_dir["video_path"])
    replay_buffer = ReplayBuffer(memory_size, traj_dir=log_dir["traj_path"])
    reward_buffer = deque(maxlen=interval_MAR)
    summary_writer = tf.compat.v2.summary.create_file_writer(
        log_dir["summary_path"])

    agent = Double_DQN(
        model=prep_model(env_name),
        policy=EpsilonGreedyPolicy_eager(dim_action=env.action_space.n,
                                         epsilon_fn=anneal_ep),
        optimizer=optimizer(learning_rate, decay, momentum, epsilon, centered),
        loss_fn=loss_fn,
        grad_clip_fn=gradient_clip_fn(flag=grad_clip_flg),
        num_action=env.action_space.n,
        model_dir=log_dir["model_path"],
        gamma=gamma,
        obs_prc_fn=prep_obs_processor(env_name))

    train(global_timestep, agent, env, replay_buffer, reward_buffer,
          summary_writer, num_eval_episodes, num_frames, eval_interval,
          hot_start, train_freq, batch_size, sync_freq, interval_MAR)
Example #3
def train_eval(log_dir="PytorchDQN",
               prev_log="",
               seed=123,
               gpu_id=0,
               env_name="Pong",
               eps_start=1.0,
               eps_end=0.01,
               learning_rate=1e-4,
               decay_rate=0.1,
               num_frames=1000000,
               train_freq=4,
               memory_size=10000,
               hot_start=10000,
               sync_freq=1000,
               batch_size=32,
               interval_MAR=100,
               gamma=0.99,
               num_eval_episodes=1,
               eval_interval=250000,
               cuda=True):
    # init global time-step
    global_timestep = 0

    # instantiate annealing funcs for ep
    anneal_ep = linear_schedule(int(num_frames * decay_rate), eps_end,
                                eps_start)

    # prep for training
    log_dir = set_up_for_training(env_name=env_name,
                                  seed=seed,
                                  gpu_id=gpu_id,
                                  log_dir=log_dir,
                                  prev_log=prev_log)
    env = prep_env(env_name=env_name, video_path=log_dir["video_path"])
    replay_buffer = ReplayBuffer(memory_size, traj_dir=log_dir["traj_path"])
    reward_buffer = deque(maxlen=interval_MAR)
    summary_writer = SummaryWriter(log_dir=log_dir["summary_path"])

    agent = dqn_agent(num_action=env.action_space.n,
                      policy=EpsilonGreedyPolicy_torch(
                          num_action=env.action_space.n, epsilon_fn=anneal_ep),
                      summary_writer=summary_writer,
                      learning_rate=learning_rate,
                      gamma=gamma,
                      model_path=log_dir["model_path"],
                      cuda=cuda)

    train(global_timestep, agent, env, replay_buffer, reward_buffer,
          summary_writer, num_eval_episodes, num_frames, eval_interval,
          hot_start, train_freq, batch_size, sync_freq, interval_MAR)
Example #4
# strip the decimal point from params.mu for use in directory names (e.g. 0.3 -> "03")
mu = str(params.mu).split(".")
mu = mu[0] + mu[1]
params.log_dir = "../../logs/logs/DDPG_batchnorm-{}-seed{}/{}-mu{}".format(
    params.train_flg, params.seed, str(params.env_name.split("-")[0]), mu)
params.actor_model_dir = "../../logs/models/DDPG_batchnorm-{}-seed{}/{}/actor-mu{}/".format(
    params.train_flg, params.seed, str(params.env_name.split("-")[0]), mu)
params.critic_model_dir = "../../logs/models/DDPG_batchnorm-{}-seed{}/{}/critic-mu{}/".format(
    params.train_flg, params.seed, str(params.env_name.split("-")[0]), mu)
params.video_dir = "../../logs/video/DDPG_batchnorm-{}-seed{}/{}-mu{}/".format(
    params.train_flg, params.seed, str(params.env_name.split("-")[0]), mu)
params.plot_path = "../../logs/plots/DDPG_batchnorm-{}-seed{}/{}-mu{}/".format(
    params.train_flg, params.seed, str(params.env_name.split("-")[0]), mu)

env = gym.make(params.env_name)
env = Monitor(env, params.video_dir)

# set seed
env.seed(params.seed)
tf.random.set_random_seed(params.seed)

replay_buffer = ReplayBuffer(params.memory_size)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)
random_process = OrnsteinUhlenbeckProcess(size=env.action_space.shape[0],
                                          theta=0.15,
                                          mu=params.mu,
                                          sigma=params.sigma)
# random_process = GaussianNoise(mu=params.mu, sigma=params.sigma)
agent = DDPG(Actor, Critic, env.action_space.shape[0], random_process, params)
train_DDPG_original(agent, env, replay_buffer, reward_buffer, summary_writer)
Example #5
if params.debug_flg:
    params.log_dir = "../logs/logs/" + now.strftime("%Y%m%d-%H%M%S") + "-DDPG/"
    params.model_dir = "../logs/models/" + now.strftime("%Y%m%d-%H%M%S") + "-DDPG/"
else:
    params.log_dir = "../logs/logs/{}".format(params.env_name)
    params.model_dir = "../logs/models/{}".format(params.env_name)

env = gym.make(params.env_name)

# set seed
env.seed(params.seed)
tf.compat.v1.random.set_random_seed(params.seed)

agent = DDPG(Actor, Critic, env.action_space.shape[0], params)
replay_buffer = ReplayBuffer(params.memory_size)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)

init_state = env.reset()  # reset
agent.predict(init_state)  # run one forward pass so the network builds its weight matrices from the input shape
gp_model, update = create_bayes_net()
optimiser = tf.compat.v1.train.AdamOptimizer()
num_sample = 100  # number of samples to draw

get_ready(agent.params)

global_timestep = tf.compat.v1.train.get_or_create_global_step()
time_buffer = deque(maxlen=agent.params.reward_buffer_ep)
log = logger(agent.params)
Example #6
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

size = 100000

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(size=size, traj_dir="./traj/")

state = env.reset()
action = env.action_space.sample()
next_state, reward, done, info = env.step(action)
env.close()

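# fill the buffer with copies of this single transition, then persist it to ./traj/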
for _ in range(size):
    memory.add(state, action, reward, next_state, done)
print(len(memory))
memory.save()

del memory
memory = ReplayBuffer(size=size, recover_data=True, traj_dir="./traj/")
print(len(memory))
Example #7
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

size = 1000

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(size, n_step=5, flg_seq=True)

print("Memory contains {0} timesteps".format(len(memory)))

state = env.reset()
action = env.action_space.sample()
next_state, reward, done, info = env.step(action)
env.close()

for _ in range(size):
    memory.add(state, action, reward, next_state, done)
print(len(memory))
memory.save()

print("Memory contains {0} timesteps".format(len(memory)))
states, actions, rewards, next_states, dones = memory.sample(batch_size=10)
print(states.shape, state.shape)

for _ in range(size):
    memory.sample(batch_size=10)
Example #8
import tensorflow as tf
from tf_rl.common.memory_tf import ReplayBuffer as ReplayBuffer_tf
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
state = env.reset()
memory_tf = ReplayBuffer_tf(capacity=1000,
                            n_step=0,
                            act_shape=(),
                            obs_shape=state.shape,
                            obs_dtype=tf.int8,
                            checkpoint_dir="./tmp")
memory = ReplayBuffer(size=1000)
done = False
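# fill both buffer implementations with the same 100 random-policy transitions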
for t in range(100):
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    memory.add(state, action, reward, next_state, done)
    memory_tf.add(state, action, reward, next_state, done)
    state = next_state
env.close()

print("=== test ===")
"""
Note:
    I ran a performance test that repeatedly samples from the Replay Buffer 1000 times
    and measured the execution time to compare Eager mode with Eager mode plus tf.function.

Result:
    without function: 9.03s
"""
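
# Sketch of the eager-mode half of the benchmark described in the note above,
# assuming the ReplayBuffer.sample(batch_size=...) API shown in the other examples.
# The tf.function variant would wrap the sampling of memory_tf, whose sample
# signature is not shown in this excerpt, so it is omitted here.
import time

start = time.time()
for _ in range(1000):
    memory.sample(batch_size=32)
print("eager sampling, 1000 iterations: {:.2f}s".format(time.time() - start))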
Example #9
        with tf.GradientTape() as tape:
            preds = self.network(states)
            loss = tf.losses.mean_squared_error(expert_action, preds)

        # get gradients
        grads = tape.gradient(loss, self.network.trainable_variables)

        # apply processed gradients to the network
        self.optimizer.apply_gradients(
            zip(grads, self.network.trainable_variables))
        return tf.math.reduce_mean(loss)


if __name__ == '__main__':
    env = gym.make("CartPole-v0")
    buffer = ReplayBuffer(size=1000)
    agent = Agent()
    expert = dqn_agent(model=cartpole_net,
                       policy=EpsilonGreedyPolicy_eager(
                           num_action=env.action_space.n,
                           epsilon_fn=lambda: tf.constant(0.02)),
                       optimizer=tf.compat.v1.train.AdamOptimizer(),
                       loss_fn=tf.compat.v1.losses.huber_loss,
                       grad_clip_fn=lambda x: x,
                       num_action=env.action_space.n,
                       model_dir="./expert",
                       gamma=0.99,
                       obs_prc_fn=lambda x: x)
    reward_total = list()

    @tf.function
Example #10
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(1000, n_step=5, flg_seq=True)

print("Memory contains {0} timesteps".format(len(memory)))

for i in range(1):
    state = env.reset()
    for t in range(1000):
        # env.render()
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        memory.add(state, action, reward, next_state, done)
        state = next_state

        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            print("Memory contains {0} timesteps".format(len(memory)))
            break

env.close()

print("Memory contains {0} timesteps".format(len(memory)))
state, action, reward, next_state, done = memory.sample(batch_size=10)
print(state.shape, action.shape)
Example #11
import gym

from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.utils import Tracker

env = gym.make("CartPole-v0")
memory = ReplayBuffer(1000)
tracker = Tracker(save_freq=100)

for i in range(100):
    state = env.reset()
    for t in range(100):
        # env.render()
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)

        # memory format is: state, action, reward, next_state, done
        memory.add(state, action, reward, next_state, done)

        # tracker stores: state, q_value, action, reward, done, loss
        tracker.store('state', state)
        tracker.store('q_value', 0.2)
        tracker.store('action', action)
        tracker.store('reward', reward)
        tracker.store('done', done)
        tracker.store('loss', 0.3)

        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break
        state = next_state
Example #12
from tf_rl.common.wrappers import wrap_deepmind, make_atari
from tf_rl.common.memory import ReplayBuffer

# for env_name , goal_score in ENV_LIST_NIPS.items():
env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory_size = 1000
replay_buffer = ReplayBuffer(memory_size)
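# fill the buffer with transitions collected by a random policy, rendering each frame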
state = env.reset()
for t in range(memory_size):
    env.render()
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    replay_buffer.add(state, action, reward, next_state, done)
    state = next_state
    if t % 10000 == 0:
        print(t)
env.close()
# replay_buffer.save(dir="./buffer.json")