def main(_): """Run td3/ddpg evaluation.""" contrib_eager_python_tfe.enable_eager_execution() if FLAGS.use_gpu: tf.device('/device:GPU:0').__enter__() tf.gfile.MakeDirs(FLAGS.log_dir) summary_writer = contrib_summary.create_file_writer( FLAGS.log_dir, flush_millis=10000) env = gym.make(FLAGS.env) if FLAGS.wrap_for_absorbing: env = lfd_envs.AbsorbingWrapper(env) obs_shape = env.observation_space.shape act_shape = env.action_space.shape with tf.variable_scope('actor'): actor = Actor(obs_shape[0], act_shape[0]) random_reward, _ = do_rollout( env, actor, None, num_trajectories=10, sample_random=True) reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale') saver = contrib_eager_python_tfe.Saver(actor.variables + [reward_scale]) last_checkpoint = tf.train.latest_checkpoint(FLAGS.load_dir) with summary_writer.as_default(): while True: last_checkpoint = wait_for_next_checkpoint(FLAGS.load_dir, last_checkpoint) total_numsteps = int(last_checkpoint.split('-')[-1]) saver.restore(last_checkpoint) average_reward, average_length = do_rollout( env, actor, None, noise_scale=0.0, num_trajectories=FLAGS.num_trials) logging.info( 'Evaluation: average episode length %d, average episode reward %f', average_length, average_reward) print('Evaluation: average episode length {}, average episode reward {}'. format(average_length, average_reward)) with contrib_summary.always_record_summaries(): if reward_scale.numpy() != 1.0: contrib_summary.scalar( 'reward/scaled', (average_reward - random_reward) / (reward_scale.numpy() - random_reward), step=total_numsteps) contrib_summary.scalar('reward', average_reward, step=total_numsteps) contrib_summary.scalar('length', average_length, step=total_numsteps)
def __init__(self,
             input_dim,
             action_dim,
             discount=0.99,
             tau=0.005,
             actor_lr=1e-3,
             critic_lr=1e-3,
             use_td3=True,
             policy_noise=0.2,
             policy_noise_clip=0.5,
             policy_update_freq=2,
             get_reward=None,
             use_absorbing_state=False):
  """Initializes actor, critic, target networks and optimizers.

  The class handles the absorbing state properly. An absorbing state is the
  state a policy enters after reaching a goal state and stays in forever. For
  most RL problems we can simply assign zero reward after the goal, but for
  GAIL we need an actual absorbing state.

  Args:
    input_dim: size of the observation space.
    action_dim: size of the action space.
    discount: reward discount.
    tau: target networks update coefficient.
    actor_lr: actor learning rate.
    critic_lr: critic learning rate.
    use_td3: whether to use TD3 instead of standard DDPG.
    policy_noise: std of the Gaussian noise added to the critic's action input.
    policy_noise_clip: magnitude at which the added Gaussian noise is clipped.
    policy_update_freq: perform a policy update once every n steps.
    get_reward: a function that, given (s, a, s'), returns a reward.
    use_absorbing_state: whether to use an absorbing state or not.
  """
  self.discount = discount
  self.tau = tau
  self.use_td3 = use_td3
  self.policy_noise = policy_noise
  self.policy_noise_clip = policy_noise_clip
  self.policy_update_freq = policy_update_freq
  self.get_reward = get_reward
  self.use_absorbing_state = use_absorbing_state

  with tf.variable_scope('actor'):
    self.actor = Actor(input_dim, action_dim)
    with tf.variable_scope('target'):
      self.actor_target = Actor(input_dim, action_dim)

    self.initial_actor_lr = actor_lr
    self.actor_lr = contrib_eager_python_tfe.Variable(actor_lr, name='lr')
    self.actor_step = contrib_eager_python_tfe.Variable(
        0, dtype=tf.int64, name='step')
    self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr)
    self.actor_optimizer._create_slots(self.actor.variables)  # pylint: disable=protected-access

  soft_update(self.actor.variables, self.actor_target.variables)

  with tf.variable_scope('critic'):
    if self.use_td3:
      self.critic = CriticTD3(input_dim + action_dim)
      with tf.variable_scope('target'):
        self.critic_target = CriticTD3(input_dim + action_dim)
    else:
      self.critic = CriticDDPG(input_dim + action_dim)
      with tf.variable_scope('target'):
        self.critic_target = CriticDDPG(input_dim + action_dim)

    self.critic_step = contrib_eager_python_tfe.Variable(
        0, dtype=tf.int64, name='step')
    self.critic_optimizer = tf.train.AdamOptimizer(learning_rate=critic_lr)
    self.critic_optimizer._create_slots(self.critic.variables)  # pylint: disable=protected-access

  soft_update(self.critic.variables, self.critic_target.variables)
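# The constructor above calls a soft_update helper that is defined elsewhere in
# the project. The sketch below shows the Polyak-averaging update it presumably
# performs, assuming the default tau does a hard copy (which is how the target
# networks would be initialized above); the function name and default are
# illustrative assumptions, not the project's actual implementation.
import tensorflow as tf


def soft_update_sketch(variables, target_variables, tau=1.0):
  """target <- tau * source + (1 - tau) * target, applied per variable pair."""
  for var, target_var in zip(variables, target_variables):
    target_var.assign(tau * var + (1.0 - tau) * target_var)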
def main(_): """Run td3/ddpg training.""" tfe.enable_eager_execution() if FLAGS.use_gpu: tf.device('/device:GPU:0').__enter__() if FLAGS.expert_dir.find(FLAGS.env) == -1: raise ValueError('Expert directory must contain the environment name') tf.set_random_seed(FLAGS.seed) np.random.seed(FLAGS.seed) random.seed(FLAGS.seed) env = gym.make(FLAGS.env) env.seed(FLAGS.seed) obs_shape = env.observation_space.shape act_shape = env.action_space.shape expert_replay_buffer_var = tfe.Variable('', name='expert_replay_buffer') saver = tfe.Saver([expert_replay_buffer_var]) tf.gfile.MakeDirs(FLAGS.save_dir) with tf.variable_scope('actor'): actor = Actor(obs_shape[0], act_shape[0]) expert_saver = tfe.Saver(actor.variables) best_checkpoint = None best_reward = float('-inf') checkpoint_state = tf.train.get_checkpoint_state(FLAGS.expert_dir) for checkpoint in checkpoint_state.all_model_checkpoint_paths: expert_saver.restore(checkpoint) expert_reward, _ = do_rollout(env, actor, replay_buffer=None, noise_scale=0.0, num_trajectories=10) if expert_reward > best_reward: best_reward = expert_reward best_checkpoint = checkpoint expert_saver.restore(best_checkpoint) expert_replay_buffer = ReplayBuffer() expert_reward, _ = do_rollout( env, actor, replay_buffer=expert_replay_buffer, noise_scale=0.0, num_trajectories=FLAGS.num_expert_trajectories) logging.info('Expert reward %f', expert_reward) print('Expert reward {}'.format(expert_reward)) expert_replay_buffer_var.assign(pickle.dumps(expert_replay_buffer)) saver.save(os.path.join(FLAGS.save_dir, 'expert_replay_buffer'))