Example #1
    def build(self):
        monitor = LocalMonitor(self.logging_dir)

        policy = dense(self.observation_dim,
                       self.action_dim,
                       hidden_size=self.hidden_size,
                       num_hidden_layers=self.num_hidden_layers)

        expert_policy = tf.keras.models.load_model(self.expert_policy_ckpt,
                                                   compile=False)

        if self.is_discrete:
            policy = Categorical(policy,
                                 temperature=self.exploration_noise_std)
            expert_policy = Categorical(expert_policy,
                                        temperature=self.exploration_noise_std)
        else:
            policy = Gaussian(policy, std=self.exploration_noise_std)
            expert_policy = Gaussian(expert_policy,
                                     std=self.exploration_noise_std)

        replay_buffer = StepReplayBuffer(max_num_steps=self.max_num_steps,
                                         selector=self.selector,
                                         monitor=monitor)

        saver = LocalSaver(self.logging_dir,
                           policy=policy,
                           replay_buffer=replay_buffer)

        algorithm = BehaviorCloningAlgorithm(policy,
                                             batch_size=self.batch_size,
                                             monitor=monitor)

        sampler = ParallelSampler(self.get_env,
                                  policy,
                                  num_threads=self.num_threads,
                                  max_path_length=self.max_path_length,
                                  selector=self.selector,
                                  monitor=monitor)

        expert_sampler = ParallelSampler(self.get_env,
                                         expert_policy,
                                         num_threads=self.num_threads,
                                         max_path_length=self.max_path_length,
                                         selector=self.selector,
                                         monitor=monitor)

        LocalTrainer(expert_sampler,
                     expert_sampler,
                     sampler,
                     replay_buffer,
                     algorithm,
                     num_epochs=self.num_epochs,
                     num_episodes_per_epoch=self.num_episodes_per_epoch,
                     num_trains_per_epoch=self.num_trains_per_epoch,
                     num_episodes_before_train=self.num_episodes_before_train,
                     num_epochs_per_eval=self.num_epochs_per_eval,
                     num_episodes_per_eval=self.num_episodes_per_eval,
                     saver=saver,
                     monitor=monitor).train()
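
A note on Example #1: BehaviorCloningAlgorithm presumably fits the policy to the transitions gathered by expert_sampler via maximum likelihood. Below is a minimal sketch of one such update, assuming the distribution wrappers expose log_prob(actions, observations) (the argument order used in Example #5) and trainable_variables; the batch tensor names are hypothetical.

import tensorflow as tf

def behavior_cloning_step(policy, optimizer, observations, expert_actions):
    # maximize the likelihood of the expert's actions under the learned policy
    with tf.GradientTape() as tape:
        loss = -tf.reduce_mean(policy.log_prob(expert_actions, observations))
    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))
    return loss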
Example #2
    def expected_value(self, *inputs):
        # expected value of a gaussian distribution
        gaussian_samples, log_probs = Gaussian.expected_value(self, *inputs)

        # squash the expected value through tanh and correct its log probability for the change of variables
        return tf.tanh(gaussian_samples), log_probs - 2.0 * tf.reduce_sum(
            math.log(2.0) - gaussian_samples -
            tf.math.softplus(-2.0 * gaussian_samples),
            axis=(-1))
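
The subtracted term is the log-determinant of the tanh change of variables, written in the numerically stable form log(1 - tanh(x)^2) = 2 * (log 2 - x - softplus(-2x)). A quick standalone check of that identity:

import math
import numpy as np
import tensorflow as tf

x = tf.constant(np.linspace(-5.0, 5.0, 11), dtype=tf.float64)

# direct form of the tanh log-Jacobian
direct = tf.math.log(1.0 - tf.tanh(x) ** 2)

# numerically stable form used in the snippet above
stable = 2.0 * (math.log(2.0) - x - tf.math.softplus(-2.0 * x))

print(np.allclose(direct.numpy(), stable.numpy()))  # True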
Example #3
    def launch(self):
        monitor = LocalMonitor(self.logging_dir)

        policy = dense(self.observation_dim,
                       self.action_dim,
                       hidden_size=self.hidden_size,
                       num_hidden_layers=self.num_hidden_layers)

        if self.is_discrete:
            policy = Categorical(policy,
                                 temperature=self.exploration_noise_std)
        else:
            policy = Gaussian(policy, std=self.exploration_noise_std)

        replay_buffer = PathReplayBuffer(max_num_paths=self.max_num_paths,
                                         max_path_length=self.max_path_length,
                                         selector=self.selector,
                                         monitor=monitor)

        saver = LocalSaver(self.logging_dir,
                           policy=policy,
                           replay_buffer=replay_buffer)

        algorithm = PolicyGradientAlgorithm(
            policy,
            reward_scale=self.reward_scale,
            discount=self.discount,
            policy_optimizer_class=tf.keras.optimizers.Adam,
            policy_optimizer_kwargs=dict(
                learning_rate=self.policy_learning_rate),
            batch_size=self.batch_size,
            monitor=monitor)

        sampler = ParallelSampler(self.get_env,
                                  policy,
                                  num_threads=self.num_threads,
                                  max_path_length=self.max_path_length,
                                  selector=self.selector,
                                  monitor=monitor)

        LocalTrainer(sampler,
                     sampler,
                     sampler,
                     replay_buffer,
                     algorithm,
                     num_epochs=self.num_epochs,
                     num_episodes_per_epoch=self.num_episodes_per_epoch,
                     num_trains_per_epoch=self.num_trains_per_epoch,
                     num_episodes_before_train=self.num_episodes_before_train,
                     num_epochs_per_eval=self.num_epochs_per_eval,
                     num_episodes_per_eval=self.num_episodes_per_eval,
                     saver=saver,
                     monitor=monitor).train()
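
PolicyGradientAlgorithm is configured with a reward scale, a discount, and an Adam optimizer for the policy. A rough sketch of the REINFORCE-style update such an algorithm typically performs follows; the actual implementation may differ, and policy.log_prob, trainable_variables, and the batch tensors follow the conventions assumed in the earlier sketch.

import tensorflow as tf

def policy_gradient_step(policy, optimizer, observations, actions, returns):
    # weight each action's log-probability by its discounted return
    with tf.GradientTape() as tape:
        log_probs = policy.log_prob(actions, observations)
        loss = -tf.reduce_mean(log_probs * returns)
    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))
    return loss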
Example #4
    def sample(self, *inputs):
        # sample from a gaussian distribution
        gaussian_samples, log_probs = Gaussian.sample(self, *inputs)

        # pass samples through the tanh
        tanh_samples = tf.tanh(gaussian_samples)

        # compute the log probability density of the samples
        return tanh_samples, log_probs - 2.0 * tf.reduce_sum(
            math.log(2.0) - gaussian_samples -
            tf.math.softplus(-2.0 * gaussian_samples),
            axis=(-1))
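
Combined with the Gaussian base class, sampling from this squashed distribution amounts to a reparameterized draw followed by the same correction. A self-contained sketch, under the assumption that the base class returns raw samples together with their diagonal-Gaussian log densities:

import math
import tensorflow as tf

def tanh_gaussian_sample(mean, std):
    # reparameterized draw from a diagonal gaussian
    noise = tf.random.normal(tf.shape(mean))
    gaussian_samples = mean + std * noise
    log_probs = tf.reduce_sum(
        -0.5 * noise ** 2 - tf.math.log(std) - 0.5 * math.log(2.0 * math.pi),
        axis=-1)

    # squash and apply the tanh change-of-variables correction
    tanh_samples = tf.tanh(gaussian_samples)
    log_probs -= 2.0 * tf.reduce_sum(
        math.log(2.0) - gaussian_samples -
        tf.math.softplus(-2.0 * gaussian_samples), axis=-1)
    return tanh_samples, log_probs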
Example #5
    def log_prob(self, tanh_samples, *inputs):
        # convert tanh gaussian samples to gaussian samples
        gaussian_samples = tf.math.atanh(
            tf.clip_by_value(tanh_samples, -0.99, 0.99))

        # compute the log probability density under a gaussian
        log_probs = Gaussian.log_prob(self, gaussian_samples, *inputs)

        # compute the log probability density of the samples
        return log_probs - 2.0 * tf.reduce_sum(
            math.log(2.0) - gaussian_samples -
            tf.math.softplus(-2.0 * gaussian_samples),
            axis=(-1))
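
The clip to ±0.99 before atanh keeps the inverse finite for actions that land exactly on the tanh boundary, at the cost of a small bias in the density of near-boundary samples:

import tensorflow as tf

actions = tf.constant([0.999, 1.0])

# unclipped: the inverse blows up at the boundary
print(tf.math.atanh(actions).numpy())  # [3.8  inf]

# clipped: finite, but slightly biased for near-boundary actions
print(tf.math.atanh(tf.clip_by_value(actions, -0.99, 0.99)).numpy())  # [2.6467 2.6467]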
Example #6
    def __init__(self, *args, std=None, **kwargs):
        # delegate construction to the Gaussian base class
        Gaussian.__init__(self, *args, std=std, **kwargs)
Example #7
    def launch(self):
        monitor = LocalMonitor(self.logging_dir)

        policy = Gaussian(dense(self.observation_dim,
                                self.action_dim,
                                hidden_size=self.hidden_size,
                                num_hidden_layers=self.num_hidden_layers),
                          std=self.exploration_noise_std)

        target_policy = policy.clone()

        qf1 = dense(self.observation_dim + self.action_dim,
                    1,
                    hidden_size=self.hidden_size,
                    num_hidden_layers=self.num_hidden_layers)
        qf2 = dense(self.observation_dim + self.action_dim,
                    1,
                    hidden_size=self.hidden_size,
                    num_hidden_layers=self.num_hidden_layers)

        target_qf1 = tf.keras.models.clone_model(qf1)
        target_qf2 = tf.keras.models.clone_model(qf2)

        replay_buffer = StepReplayBuffer(max_num_steps=self.max_num_steps,
                                         selector=self.selector,
                                         monitor=monitor)

        saver = LocalSaver(self.logging_dir,
                           policy=policy,
                           qf1=qf1,
                           qf2=qf2,
                           target_qf1=target_qf1,
                           target_qf2=target_qf2,
                           replay_buffer=replay_buffer)

        algorithm = TD3Algorithm(
            policy,
            target_policy,
            qf1,
            qf2,
            target_qf1,
            target_qf2,
            reward_scale=self.reward_scale,
            discount=self.discount,
            tau=self.tau,
            target_noise=self.target_noise,
            target_clipping=self.target_clipping,
            policy_delay=self.policy_delay,
            qf_optimizer_class=tf.keras.optimizers.Adam,
            qf_optimizer_kwargs=dict(learning_rate=self.qf_learning_rate),
            policy_optimizer_class=tf.keras.optimizers.Adam,
            policy_optimizer_kwargs=dict(
                learning_rate=self.policy_learning_rate),
            batch_size=self.batch_size,
            monitor=monitor)

        sampler = ParallelSampler(self.get_env,
                                  policy,
                                  num_threads=self.num_threads,
                                  max_path_length=self.max_path_length,
                                  selector=self.selector,
                                  monitor=monitor)

        LocalTrainer(sampler,
                     sampler,
                     sampler,
                     replay_buffer,
                     algorithm,
                     num_epochs=self.num_epochs,
                     num_episodes_per_epoch=self.num_episodes_per_epoch,
                     num_trains_per_epoch=self.num_trains_per_epoch,
                     num_episodes_before_train=self.num_episodes_before_train,
                     num_epochs_per_eval=self.num_epochs_per_eval,
                     num_episodes_per_eval=self.num_episodes_per_eval,
                     saver=saver,
                     monitor=monitor).train()
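
For reference, target_noise, target_clipping, policy_delay and tau map onto the standard TD3 ingredients: target policy smoothing, clipped double Q-learning, delayed policy updates, and Polyak-averaged target networks. A sketch of how the critic targets are typically formed under those conventions; the exact TD3Algorithm internals may differ, and the ±1 action bounds, the expected_value convention from Example #2, and the batch tensor names are assumptions.

import tensorflow as tf

def td3_critic_targets(target_policy, target_qf1, target_qf2,
                       rewards, dones, next_observations,
                       discount=0.99, target_noise=0.2, target_clipping=0.5):
    # target policy smoothing: perturb the target action with clipped noise
    next_actions, _ = target_policy.expected_value(next_observations)
    noise = tf.clip_by_value(
        tf.random.normal(tf.shape(next_actions), stddev=target_noise),
        -target_clipping, target_clipping)
    next_actions = tf.clip_by_value(next_actions + noise, -1.0, 1.0)

    # clipped double Q-learning: bootstrap from the smaller target critic
    inputs = tf.concat([next_observations, next_actions], axis=-1)
    next_q = tf.minimum(target_qf1(inputs), target_qf2(inputs))[..., 0]

    return rewards + discount * (1.0 - dones) * next_q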