params.alpha = 0.6
params.beta_start = 0.4
params.beta_end = 1.0
params.prioritized_replay_noise = 1e-6

# init global time-step
global_timestep = tf.train.get_or_create_global_step()

# instantiate annealing schedules for epsilon, the learning rate, and PER beta
anneal_ep = tf.train.polynomial_decay(params.ep_start, global_timestep, params.decay_steps, params.ep_end)
anneal_lr = tf.train.polynomial_decay(params.lr_start, global_timestep, params.decay_steps, params.lr_end)
beta = tf.train.polynomial_decay(params.beta_start, global_timestep, params.decay_steps, params.beta_end)
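# For reference: tf.train.polynomial_decay with its default power=1.0 interpolates
# linearly from the start value to the end value over decay_steps steps. A pure-Python
# sketch of the same schedule (illustration only, not used below):
def _linear_anneal(start, end, step, decay_steps):
    frac = min(step, decay_steps) / float(decay_steps)
    return (start - end) * (1.0 - frac) + end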

# prep for training
policy = EpsilonGreedyPolicy_eager(Epsilon_fn=anneal_ep)
optimizer = tf.train.RMSPropOptimizer(anneal_lr, decay=0.99, momentum=0.0, epsilon=1e-6)
replay_buffer = PrioritizedReplayBuffer(params.memory_size, alpha=params.alpha)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
loss_fn = create_loss_func(params.loss_fn)
grad_clip_fn = gradient_clip_fn(flag=params.grad_clip_flg)
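# How params.alpha and beta enter prioritized replay (Schaul et al., 2016):
# transitions are drawn with probability P(i) = p_i^alpha / sum_k p_k^alpha, and the
# update is corrected with importance-sampling weights w_i = (N * P(i))^(-beta),
# normalised by their maximum. A pure-Python sketch of that math (illustration only;
# PrioritizedReplayBuffer implements it internally):
def _per_sampling_stats(priorities, alpha, beta):
    scaled = [p ** alpha for p in priorities]        # priority^alpha
    total = sum(scaled)
    probs = [s / total for s in scaled]              # sampling probabilities
    n = len(priorities)
    weights = [(n * pr) ** (-beta) for pr in probs]  # IS correction weights
    w_max = max(weights)
    return probs, [w / w_max for w in weights]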

# create a directory for log/model
params = create_log_model_directory(params, get_alg_name())
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)
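# Typical eager-mode usage of the writer (a sketch; the actual metric logging is
# assumed to happen inside train_DQN_PER):
#   with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
#       tf.contrib.summary.scalar("train/loss", loss_value, step=global_timestep)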

# choose the env and instantiate the corresponding agent
agent, env = invoke_agent_env(params, get_alg_name())
agent = eval(agent)(Model, optimizer, loss_fn, grad_clip_fn, env.action_space.n, params)

train_DQN_PER(agent, env, policy, replay_buffer, reward_buffer, beta, summary_writer)
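# train_DQN_PER is defined elsewhere in this repo. A minimal sketch of the per-step
# PER update it is assumed to perform, written against a baselines-style buffer API
# (sample(batch_size, beta) -> (..., weights, idxes), update_priorities); the names
# and signatures here are assumptions, for illustration only:
def _per_update_sketch(agent, replay_buffer, batch_size, beta_value, noise=1e-6):
    states, actions, rewards, next_states, dones, weights, idxes = \
        replay_buffer.sample(batch_size, beta=beta_value)
    # the agent is assumed to weight the per-sample TD loss by `weights`
    # and to return the per-sample TD errors
    td_errors = agent.update(states, actions, rewards, next_states, dones, weights)
    # refresh priorities with |TD error| + a small constant so nothing gets zero priority
    replay_buffer.update_priorities(idxes, abs(td_errors) + noise)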
Example #2
# NOTE: the top of this example is truncated; the flag name below is an assumption.
parser.add_argument("--google_colab",
                    type=bool,
                    help="if you are executing this on GoogleColab")
params = parser.parse_args()
params.goal = ROBOTICS_ENV_LIST[params.env_name]
params.test_episodes = 10

env = gym.make(params.env_name)
params.max_action = env.action_space.high[0]
params.num_action = env.action_space.shape[0]
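# max_action is the (symmetric) bound of the continuous action space; the actor is
# typically assumed to squash its raw output into that range, e.g.
#   action = params.max_action * np.tanh(raw_actor_output)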

# set seed
env.seed(params.seed)
tf.random.set_random_seed(params.seed)

# create a directory for log/model
params = create_log_model_directory(params, get_alg_name())

# get init obs for creating env_params
obs = env.reset()

# collect basic env parameters (dimensions, action bound, episode length)
env_params = {
    'obs': obs['observation'].shape[0],
    'goal': obs['desired_goal'].shape[0],
    'action': env.action_space.shape[0],
    'action_max': env.action_space.high[0],
    'max_timesteps': env._max_episode_steps
}
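# Goal-conditioned robotics envs return dict observations; the policy input is usually
# the concatenation of 'observation' and 'desired_goal', so (assuming that convention)
# its dimensionality is:
_policy_input_dim = env_params['obs'] + env_params['goal']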

her_sample_func = her_sampler(params.replay_strategy, params.replay_k,
                              env.compute_reward)
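# The "future" HER strategy relabels sampled transitions with goals achieved later in
# the same episode and recomputes the reward for the substituted goal. A single-transition
# sketch of that idea (names are assumptions; the real batched logic lives in her_sample_func):
import random

def _her_relabel_sketch(episode, t, compute_reward):
    # pick an achieved goal from a random future timestep of the same episode
    future_t = random.randint(t, len(episode) - 1)
    new_goal = episode[future_t]['achieved_goal']
    transition = dict(episode[t])
    transition['desired_goal'] = new_goal
    # recompute the reward w.r.t. the substituted goal using the env's reward function
    transition['reward'] = compute_reward(transition['achieved_goal'], new_goal, None)
    return transition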