params.alpha = 0.6
params.beta_start = 0.4
params.beta_end = 1.0
params.prioritized_replay_noise = 1e-6

# initialise the global time-step counter
global_timestep = tf.train.get_or_create_global_step()

# instantiate annealing schedules for epsilon, learning rate, and the PER beta
anneal_ep = tf.train.polynomial_decay(params.ep_start, global_timestep, params.decay_steps, params.ep_end)
anneal_lr = tf.train.polynomial_decay(params.lr_start, global_timestep, params.decay_steps, params.lr_end)
beta = tf.train.polynomial_decay(params.beta_start, global_timestep, params.decay_steps, params.beta_end)

# prep for training
policy = EpsilonGreedyPolicy_eager(Epsilon_fn=anneal_ep)
optimizer = tf.train.RMSPropOptimizer(anneal_lr, 0.99, 0.0, 1e-6)
replay_buffer = PrioritizedReplayBuffer(params.memory_size, alpha=params.alpha)
reward_buffer = deque(maxlen=params.reward_buffer_ep)
loss_fn = create_loss_func(params.loss_fn)
grad_clip_fn = gradient_clip_fn(flag=params.grad_clip_flg)

# create a directory for logs/models
params = create_log_model_directory(params, get_alg_name())
summary_writer = tf.contrib.summary.create_file_writer(params.log_dir)

# choose the env and instantiate the agent accordingly
agent, env = invoke_agent_env(params, get_alg_name())
agent = eval(agent)(Model, optimizer, loss_fn, grad_clip_fn, env.action_space.n, params)

train_DQN_PER(agent, env, policy, replay_buffer, reward_buffer, beta, summary_writer)
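# --- illustrative sketch (not part of the original script) -------------------
# The annealed `beta` above is the importance-sampling exponent used inside the
# PER training loop. The helper below is a simplified, self-contained NumPy
# illustration of how the IS weights w_i = (N * P(i))^(-beta) / max_j w_j are
# typically formed; the function name and arguments are hypothetical, and for
# brevity P(i) is computed over the sampled batch only, whereas a real buffer
# computes it over all stored transitions.
import numpy as np

def per_is_weights(priorities, beta, alpha=0.6, eps=1e-6):
    """Normalized importance-sampling weights for a sampled PER batch.

    priorities : |TD-error| magnitudes of the sampled transitions
    beta       : current value of the annealed beta (0.4 -> 1.0 above)
    """
    p = (np.asarray(priorities, dtype=np.float64) + eps) ** alpha  # p_i = (|delta_i| + eps)^alpha
    probs = p / p.sum()                                            # P(i) = p_i / sum_k p_k
    weights = (len(p) * probs) ** (-beta)                          # w_i = (N * P(i))^-beta
    return weights / weights.max()                                 # normalize by the max weight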
                    type=bool, help="if you are executing this on GoogleColab")
params = parser.parse_args()
params.goal = ROBOTICS_ENV_LIST[params.env_name]
params.test_episodes = 10

env = gym.make(params.env_name)
params.max_action = env.action_space.high[0]
params.num_action = env.action_space.shape[0]

# set seeds
env.seed(params.seed)
tf.random.set_random_seed(params.seed)

# create a directory for logs/models
params = create_log_model_directory(params, get_alg_name())

# get the initial observation, used to construct env_params
obs = env.reset()

# collect basic environment specs
env_params = {
    'obs': obs['observation'].shape[0],
    'goal': obs['desired_goal'].shape[0],
    'action': env.action_space.shape[0],
    'action_max': env.action_space.high[0],
    'max_timesteps': env._max_episode_steps
}

her_sample_func = her_sampler(params.replay_strategy, params.replay_k, env.compute_reward)
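# --- illustrative sketch (not part of the original script) -------------------
# `her_sampler` builds a sampling function for Hindsight Experience Replay.
# With the "future" replay strategy, a transition's desired goal is replaced by
# an achieved goal observed later in the same episode, and the reward is
# recomputed with env.compute_reward. The function below is a simplified,
# self-contained illustration with hypothetical field names, not the
# repository's implementation.
import numpy as np

def her_future_relabel(episode, compute_reward, replay_k=4):
    """Relabel one episode's transitions with future achieved goals.

    episode        : dict with 'ag' (achieved goals, shape [T+1, goal_dim])
                     and 'g' (desired goals, shape [T, goal_dim])
    compute_reward : env.compute_reward(achieved_goal, desired_goal, info)
    replay_k       : controls the fraction of transitions that get relabeled
    """
    T = episode['ag'].shape[0] - 1
    future_p = 1.0 - 1.0 / (1.0 + replay_k)           # probability of relabeling a transition
    t = np.arange(T)
    relabel = np.random.uniform(size=T) < future_p    # which timesteps to relabel
    future_t = np.random.randint(t + 1, T + 1)        # a future index for each timestep
    goals = episode['g'][:T].copy()
    goals[relabel] = episode['ag'][future_t[relabel]]  # substitute future achieved goals
    rewards = compute_reward(episode['ag'][1:T + 1], goals, None)
    return goals, rewards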