def build(self):
    monitor = LocalMonitor(self.logging_dir)

    # build the learned policy and load the pretrained expert policy
    policy = dense(
        self.observation_dim,
        self.action_dim,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers)
    expert_policy = tf.keras.models.load_model(
        self.expert_policy_ckpt, compile=False)

    # wrap both policies in matching exploration distributions
    if self.is_discrete:
        policy = Categorical(
            policy, temperature=self.exploration_noise_std)
        expert_policy = Categorical(
            expert_policy, temperature=self.exploration_noise_std)
    else:
        policy = Gaussian(policy, std=self.exploration_noise_std)
        expert_policy = Gaussian(
            expert_policy, std=self.exploration_noise_std)

    replay_buffer = StepReplayBuffer(
        max_num_steps=self.max_num_steps,
        selector=self.selector,
        monitor=monitor)

    saver = LocalSaver(
        self.logging_dir,
        policy=policy,
        replay_buffer=replay_buffer)

    algorithm = BehaviorCloningAlgorithm(
        policy,
        batch_size=self.batch_size,
        monitor=monitor)

    # one sampler runs the learned policy, another runs the expert policy
    sampler = ParallelSampler(
        self.get_env,
        policy,
        num_threads=self.num_threads,
        max_path_length=self.max_path_length,
        selector=self.selector,
        monitor=monitor)
    expert_sampler = ParallelSampler(
        self.get_env,
        expert_policy,
        num_threads=self.num_threads,
        max_path_length=self.max_path_length,
        selector=self.selector,
        monitor=monitor)

    LocalTrainer(
        expert_sampler,
        expert_sampler,
        sampler,
        replay_buffer,
        algorithm,
        num_epochs=self.num_epochs,
        num_episodes_per_epoch=self.num_episodes_per_epoch,
        num_trains_per_epoch=self.num_trains_per_epoch,
        num_episodes_before_train=self.num_episodes_before_train,
        num_epochs_per_eval=self.num_epochs_per_eval,
        num_episodes_per_eval=self.num_episodes_per_eval,
        saver=saver,
        monitor=monitor).train()
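# For context, behavior cloning fits the learned policy by maximum likelihood on
# the expert's state-action pairs collected above. The helper below is an
# illustrative sketch of that supervised objective, not BehaviorCloningAlgorithm
# itself; it assumes the policy exposes log_prob(actions, observations) and
# trainable_variables, in line with the distribution wrappers in this file, and
# uses the same tensorflow import as the rest of the file.
def _behavior_cloning_step(policy, optimizer, observations, actions):
    with tf.GradientTape() as tape:
        # maximize the likelihood of the expert's actions under the policy
        loss = -tf.reduce_mean(policy.log_prob(actions, observations))
    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))
    return loss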
def expected_value(self, *inputs):
    # expected value of the underlying gaussian distribution
    gaussian_samples, log_probs = Gaussian.expected_value(self, *inputs)
    # squash through tanh and apply the change-of-variables correction
    # to the log probability density
    return tf.tanh(gaussian_samples), log_probs - 2.0 * tf.reduce_sum(
        math.log(2.0)
        - gaussian_samples
        - tf.math.softplus(-2.0 * gaussian_samples), axis=(-1))
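# The term subtracted above (and in sample / log_prob below) relies on the
# identity log(1 - tanh(u)^2) = 2 * (log 2 - u - softplus(-2u)), which is the
# numerically stable form of the tanh change-of-variables Jacobian. The helper
# below is a standalone sanity check of that identity, not part of the library
# API; it uses the same tensorflow and math imports as the rest of this file.
def _check_tanh_log_det_identity(num_samples=1000):
    # compare the naive and the numerically stable forms of the jacobian term
    u = tf.random.normal([num_samples], dtype=tf.float64)
    naive = tf.math.log(1.0 - tf.tanh(u) ** 2)
    stable = 2.0 * (math.log(2.0) - u - tf.math.softplus(-2.0 * u))
    tf.debugging.assert_near(naive, stable, atol=1e-6)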
def launch(self):
    monitor = LocalMonitor(self.logging_dir)

    # build the policy and wrap it in an exploration distribution
    policy = dense(
        self.observation_dim,
        self.action_dim,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers)
    if self.is_discrete:
        policy = Categorical(policy, temperature=self.exploration_noise_std)
    else:
        policy = Gaussian(policy, std=self.exploration_noise_std)

    # policy gradient trains on whole paths rather than individual steps
    replay_buffer = PathReplayBuffer(
        max_num_paths=self.max_num_paths,
        max_path_length=self.max_path_length,
        selector=self.selector,
        monitor=monitor)

    saver = LocalSaver(
        self.logging_dir,
        policy=policy,
        replay_buffer=replay_buffer)

    algorithm = PolicyGradientAlgorithm(
        policy,
        reward_scale=self.reward_scale,
        discount=self.discount,
        policy_optimizer_class=tf.keras.optimizers.Adam,
        policy_optimizer_kwargs=dict(
            learning_rate=self.policy_learning_rate),
        batch_size=self.batch_size,
        monitor=monitor)

    sampler = ParallelSampler(
        self.get_env,
        policy,
        num_threads=self.num_threads,
        max_path_length=self.max_path_length,
        selector=self.selector,
        monitor=monitor)

    LocalTrainer(
        sampler,
        sampler,
        sampler,
        replay_buffer,
        algorithm,
        num_epochs=self.num_epochs,
        num_episodes_per_epoch=self.num_episodes_per_epoch,
        num_trains_per_epoch=self.num_trains_per_epoch,
        num_episodes_before_train=self.num_episodes_before_train,
        num_epochs_per_eval=self.num_epochs_per_eval,
        num_episodes_per_eval=self.num_episodes_per_eval,
        saver=saver,
        monitor=monitor).train()
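# For reference, the update a vanilla policy-gradient algorithm performs is, in
# its simplest REINFORCE form, a maximization of the return-weighted log
# likelihood of sampled actions. The sketch below is illustrative only and does
# not reproduce PolicyGradientAlgorithm; it assumes rewards have shape
# [num_paths, max_path_length] and that the policy exposes
# log_prob(actions, observations) and trainable_variables.
def _reinforce_update(policy, optimizer, observations, actions, rewards,
                      discount=0.99, reward_scale=1.0):
    # weight every action in a path by that path's total discounted return
    steps = tf.cast(tf.range(tf.shape(rewards)[1]), rewards.dtype)
    returns = tf.reduce_sum(
        reward_scale * rewards * discount ** steps, axis=1, keepdims=True)
    with tf.GradientTape() as tape:
        log_probs = policy.log_prob(actions, observations)
        loss = -tf.reduce_mean(returns * log_probs)
    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))
    return loss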
def sample(self, *inputs):
    # sample from the underlying gaussian distribution
    gaussian_samples, log_probs = Gaussian.sample(self, *inputs)
    # pass the samples through the tanh squashing function
    tanh_samples = tf.tanh(gaussian_samples)
    # apply the change-of-variables correction to the log probability density
    return tanh_samples, log_probs - 2.0 * tf.reduce_sum(
        math.log(2.0)
        - gaussian_samples
        - tf.math.softplus(-2.0 * gaussian_samples), axis=(-1))
def log_prob(self, tanh_samples, *inputs):
    # invert the tanh to recover the underlying gaussian samples, clipping
    # away from the boundary where atanh diverges
    gaussian_samples = tf.math.atanh(
        tf.clip_by_value(tanh_samples, -0.99, 0.99))
    # compute the log probability density under the gaussian
    log_probs = Gaussian.log_prob(self, gaussian_samples, *inputs)
    # apply the change-of-variables correction for the tanh squashing
    return log_probs - 2.0 * tf.reduce_sum(
        math.log(2.0)
        - gaussian_samples
        - tf.math.softplus(-2.0 * gaussian_samples), axis=(-1))
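# The squashed-gaussian density that sample and log_prob implement can also be
# written out directly with primitive ops. The sketch below is illustrative
# only; mean and std are placeholder tensors (shape [..., action_dim]), not
# attributes of the class above. It draws u ~ N(mean, std), squashes a =
# tanh(u), and returns the action together with log pi(a), combining the
# diagonal gaussian density with the same tanh jacobian correction as above.
def _squashed_gaussian_demo(mean, std):
    u = mean + std * tf.random.normal(tf.shape(mean))
    a = tf.tanh(u)
    # diagonal gaussian log density of the pre-tanh sample
    gaussian_log_prob = tf.reduce_sum(
        -0.5 * ((u - mean) / std) ** 2
        - tf.math.log(std)
        - 0.5 * math.log(2.0 * math.pi), axis=-1)
    # subtract the tanh jacobian term, as in sample / log_prob above
    correction = 2.0 * tf.reduce_sum(
        math.log(2.0) - u - tf.math.softplus(-2.0 * u), axis=-1)
    return a, gaussian_log_prob - correction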
def __init__(self, *args, std=None, **kwargs):
    Gaussian.__init__(self, *args, std=std, **kwargs)
def launch(self):
    monitor = LocalMonitor(self.logging_dir)

    # build the policy and its target copy
    policy = Gaussian(
        dense(
            self.observation_dim,
            self.action_dim,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers),
        std=self.exploration_noise_std)
    target_policy = policy.clone()

    # build the twin q functions and their target copies
    qf1 = dense(
        self.observation_dim + self.action_dim,
        1,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers)
    qf2 = dense(
        self.observation_dim + self.action_dim,
        1,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers)
    target_qf1 = tf.keras.models.clone_model(qf1)
    target_qf2 = tf.keras.models.clone_model(qf2)

    replay_buffer = StepReplayBuffer(
        max_num_steps=self.max_num_steps,
        selector=self.selector,
        monitor=monitor)

    saver = LocalSaver(
        self.logging_dir,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        replay_buffer=replay_buffer)

    algorithm = TD3Algorithm(
        policy,
        target_policy,
        qf1,
        qf2,
        target_qf1,
        target_qf2,
        reward_scale=self.reward_scale,
        discount=self.discount,
        tau=self.tau,
        target_noise=self.target_noise,
        target_clipping=self.target_clipping,
        policy_delay=self.policy_delay,
        qf_optimizer_class=tf.keras.optimizers.Adam,
        qf_optimizer_kwargs=dict(learning_rate=self.qf_learning_rate),
        policy_optimizer_class=tf.keras.optimizers.Adam,
        policy_optimizer_kwargs=dict(
            learning_rate=self.policy_learning_rate),
        batch_size=self.batch_size,
        monitor=monitor)

    sampler = ParallelSampler(
        self.get_env,
        policy,
        num_threads=self.num_threads,
        max_path_length=self.max_path_length,
        selector=self.selector,
        monitor=monitor)

    LocalTrainer(
        sampler,
        sampler,
        sampler,
        replay_buffer,
        algorithm,
        num_epochs=self.num_epochs,
        num_episodes_per_epoch=self.num_episodes_per_epoch,
        num_trains_per_epoch=self.num_trains_per_epoch,
        num_episodes_before_train=self.num_episodes_before_train,
        num_epochs_per_eval=self.num_epochs_per_eval,
        num_episodes_per_eval=self.num_episodes_per_eval,
        saver=saver,
        monitor=monitor).train()
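# For context, the pieces configured above map onto the standard TD3 recipe:
# the target policy is smoothed with clipped noise, the two target critics are
# combined with a minimum (clipped double q learning), and the target networks
# track the online networks by Polyak averaging with rate tau. The sketches
# below follow the published algorithm, not the library's TD3Algorithm
# internals; they assume the target policy exposes expected_value(...) returning
# (actions, log_probs) as in this file, and that each q function maps a
# concatenated [observation, action] tensor to a [..., 1] value.
def _td3_targets(target_policy, target_qf1, target_qf2, next_observations,
                 rewards, terminals, discount=0.99,
                 target_noise=0.2, target_clipping=0.5):
    # smooth the target action with clipped gaussian noise
    next_actions, _ = target_policy.expected_value(next_observations)
    noise = tf.clip_by_value(
        target_noise * tf.random.normal(tf.shape(next_actions)),
        -target_clipping, target_clipping)
    next_actions = next_actions + noise
    # clipped double q learning: take the minimum of the two target critics
    next_inputs = tf.concat([next_observations, next_actions], axis=-1)
    next_q = tf.minimum(
        target_qf1(next_inputs), target_qf2(next_inputs))[..., 0]
    return rewards + discount * (1.0 - terminals) * next_q


def _polyak_update(target_model, model, tau=0.005):
    # soft target update: target <- tau * online + (1 - tau) * target
    for target_var, online_var in zip(target_model.variables, model.variables):
        target_var.assign(tau * online_var + (1.0 - tau) * target_var)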