params.test_episodes = 10
params.alpha = 0.6
params.beta_start = 0.4
params.beta_end = 1.0
params.prioritized_replay_noise = 1e-6

# init global time-step
global_timestep = tf.train.get_or_create_global_step()

# instantiate annealing funcs for ep and lr
anneal_ep = tf.train.polynomial_decay(params.ep_start, global_timestep, params.decay_steps, params.ep_end)
anneal_lr = tf.train.polynomial_decay(params.lr_start, global_timestep, params.decay_steps, params.lr_end)
beta = tf.train.polynomial_decay(params.beta_start, global_timestep, params.decay_steps, params.beta_end)
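# note: with the default power=1.0, tf.train.polynomial_decay is simply a linear
# anneal from the start value to the end value over params.decay_steps global steps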

# prep for training
policy = EpsilonGreedyPolicy_eager(Epsilon_fn=anneal_ep)
optimizer = tf.train.RMSPropOptimizer(anneal_lr, decay=0.99, momentum=0.0, epsilon=1e-6)
replay_buffer = PrioritizedReplayBuffer(params.memory_size, alpha=params.alpha)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
loss_fn = create_loss_func(params.loss_fn)
grad_clip_fn = gradient_clip_fn(flag=params.grad_clip_flg)
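Here `alpha` sets how strongly priorities skew sampling and the annealed `beta` scales the importance-sampling correction applied to the loss. A minimal sketch of that math (the standard Schaul et al. formulas, not the buffer's actual code; `priorities` is assumed to be a float array of |TD error| plus the small noise constant):

import numpy as np

def per_probs_and_weights(priorities, alpha, beta):
    # P(i) = p_i^alpha / sum_k p_k^alpha  -- sampling probabilities
    probs = priorities ** alpha
    probs = probs / probs.sum()
    # w_i = (N * P(i))^(-beta), normalised so the largest weight is 1
    weights = (len(priorities) * probs) ** (-beta)
    return probs, weights / weights.max()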

# create a directory for log/model
params = create_log_model_directory(params, get_alg_name())
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)

# choose env and instantiate the agent correspondingly
agent, env = invoke_agent_env(params, get_alg_name())
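# invoke_agent_env returns the env together with the agent class name as a string,
# so eval() below resolves that string to the actual class before instantiating it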
agent = eval(agent)(Model, optimizer, loss_fn, grad_clip_fn, env.action_space.n, params)

train_DQN_PER(agent, env, policy, replay_buffer, reward_buffer, beta, summary_writer)
Example #2
params.goal = 195
params.test_episodes = 10
params.prioritized_replay_alpha = 0.6
params.prioritized_replay_beta_start = 0.4
params.prioritized_replay_beta_end = 1.0
params.prioritized_replay_noise = 1e-6

replay_buffer = PrioritizedReplayBuffer(params.memory_size,
                                        alpha=params.prioritized_replay_alpha)
Beta = AnnealingSchedule(start=params.prioritized_replay_beta_start,
                         end=params.prioritized_replay_beta_end,
                         decay_steps=params.decay_steps)
Epsilon = AnnealingSchedule(start=params.epsilon_start,
                            end=params.epsilon_end,
                            decay_steps=params.decay_steps)
policy = EpsilonGreedyPolicy_eager(Epsilon_fn=Epsilon)
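For reference, the epsilon-greedy rule behind this policy picks a random action with probability epsilon and the greedy action otherwise; a minimal stand-alone version (not EpsilonGreedyPolicy_eager itself; `q_values` is assumed to be a 1-D array of action values):

import numpy as np

def epsilon_greedy_action(q_values, epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(len(q_values))  # explore: uniform random action
    return int(np.argmax(q_values))              # exploit: greedy action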

reward_buffer = deque(maxlen=params.reward_buffer_ep)
anneal_lr = AnnealingSchedule(start=0.0025,
                              end=0.00025,
                              decay_steps=params.decay_steps,
                              decay_type="linear")
optimizer = tf.train.RMSPropOptimizer(anneal_lr.get_value(), decay=0.99, momentum=0.0, epsilon=1e-6)

if params.loss_fn == "huber":
    loss_fn = tf.losses.huber_loss
elif params.loss_fn == "mse":
    loss_fn = tf.losses.mean_squared_error
else:
    raise ValueError("unsupported loss_fn: {}".format(params.loss_fn))
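The Huber loss is quadratic for small errors and linear for large ones, so outlier TD errors do not dominate the gradient, which is why it is the usual choice for DQN; an illustrative stand-alone version (not tf.losses.huber_loss itself):

import numpy as np

def huber(error, delta=1.0):
    # 0.5 * e^2 inside |e| <= delta, delta * (|e| - 0.5 * delta) outside
    quadratic = np.minimum(np.abs(error), delta)
    linear = np.abs(error) - quadratic
    return 0.5 * quadratic ** 2 + delta * linear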
Example #3
        env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))

    params = Parameters(algo="DQfD", mode=args.mode)
    params.num_episodes = args.num_episodes
    replay_buffer = PrioritizedReplayBuffer(
        params.memory_size, alpha=params.prioritized_replay_alpha)
    Beta = AnnealingSchedule(start=params.prioritized_replay_beta_start,
                             end=params.prioritized_replay_beta_end,
                             decay_steps=params.decay_steps)
    agent = DQfD(args.mode, Model, Model, env.action_space.n, params,
                 logdirs.model_DQN)
    if params.policy_fn == "Eps":
        Epsilon = AnnealingSchedule(start=params.epsilon_start,
                                    end=params.epsilon_end,
                                    decay_steps=params.decay_steps)
        policy = EpsilonGreedyPolicy_eager(Epsilon_fn=Epsilon)
    elif params.policy_fn == "Boltzmann":
        policy = BoltzmannQPolicy_eager()

    reward_buffer = deque(maxlen=params.reward_buffer_ep)
    summary_writer = tf.contrib.summary.create_file_writer(logdirs.log_DQfD)

    expert = DQN(args.mode, Model_CartPole_DQN, Model_CartPole_DQN,
                 env.action_space.n, params, logdirs.model_DQN)
    expert_policy = TestPolicy()
    expert.check_point.restore(expert.manager.latest_checkpoint)
    print("Restore the model from disk")

    # agent, _ = pretrain_without_prioritisation(agent, expert, policy, expert_policy, env, 100, 100)
    agent, replay_buffer = pretrain_with_prioritisation(
        agent, expert, policy, expert_policy, env, replay_buffer, 100, 1000)
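For context, pretrain_with_prioritisation presumably rolls out the restored expert and stores the resulting demonstration transitions in the prioritized buffer before the DQfD agent learns from its own experience; a rough sketch of that idea (the helper name, the select_action interface, and the add() signature below are assumptions, not the library's code):

def seed_buffer_with_expert(env, expert_agent, expert_policy, replay_buffer, n_steps):
    # hypothetical helper: fill the replay buffer with expert demonstrations
    state = env.reset()
    for _ in range(n_steps):
        action = expert_policy.select_action(expert_agent, state)   # assumed interface
        next_state, reward, done, _ = env.step(action)
        replay_buffer.add(state, action, reward, next_state, done)  # assumed signature
        state = env.reset() if done else next_state
    return replay_buffer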
Example #4
                    help="if you are executing this on GoogleColab")
params = parser.parse_args()
params.goal = 195
params.test_episodes = 20

# init global time-step
global_timestep = tf.train.get_or_create_global_step()

# instantiate annealing funcs for ep and lr
anneal_ep = tf.train.polynomial_decay(params.ep_start, global_timestep,
                                      params.decay_steps, params.ep_end)
anneal_lr = tf.train.polynomial_decay(params.lr_start, global_timestep,
                                      params.decay_steps, params.lr_end)

# prep for training
policy = EpsilonGreedyPolicy_eager(Epsilon_fn=anneal_ep)
optimizer = tf.train.RMSPropOptimizer(anneal_lr, decay=0.99, momentum=0.0, epsilon=1e-6)
replay_buffer = ReplayBuffer(params.memory_size)
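# note: unlike the prioritized-replay examples above, this example uses a plain
# uniform-sampling ReplayBuffer, so no alpha/beta annealing is needed here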
reward_buffer = deque(maxlen=params.reward_buffer_ep)
loss_fn = create_loss_func(params.loss_fn)
grad_clip_fn = gradient_clip_fn(flag=params.grad_clip_flg)

# create a directory for log/model
params = create_log_model_directory(params, get_alg_name())
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)

# choose env and instantiate the agent correspondingly
env = MyWrapper(gym.make("CartPole-v0"))
agent = DQN_cartpole(Model, optimizer, loss_fn, grad_clip_fn,
                     env.action_space.n, params)