Example #1
 def test_ddpg_pendulum(self):
     """Test PPO with Pendulum environment."""
     logger._tensorboard = TensorBoardOutput()
     env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
     action_noise = OUStrategy(env.spec, sigma=0.2)
     policy = ContinuousMLPPolicy(env_spec=env.spec,
                                  hidden_sizes=[64, 64],
                                  hidden_nonlinearity=tf.nn.relu,
                                  output_nonlinearity=tf.nn.tanh)
     qf = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=tf.nn.relu)
     replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100)
     algo = DDPG(
         env,
         policy=policy,
         policy_lr=1e-4,
         qf_lr=1e-3,
         qf=qf,
         replay_buffer=replay_buffer,
         plot=False,
         target_update_tau=1e-2,
         n_epochs=10,
         n_epoch_cycles=20,
         max_path_length=100,
         n_train_steps=50,
         discount=0.9,
         min_buffer_size=int(1e4),
         exploration_strategy=action_noise,
     )
     last_avg_ret = algo.train(sess=self.sess)
     assert last_avg_ret > 30
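The snippets on this page are shown without their import blocks. A minimal sketch of the imports they rely on is below; the module paths are assumptions based on the garage TF tree of this era and may differ between garage versions.

# Assumed imports for the DDPG examples on this page; the exact module paths
# depend on the garage version in use and may need adjusting.
import gym
import tensorflow as tf

from garage.tf.algos import DDPG
from garage.tf.envs import TfEnv
from garage.tf.policies import ContinuousMLPPolicy
from garage.tf.q_functions import ContinuousMLPQFunction
from garage.replay_buffer import SimpleReplayBuffer
from garage.exploration_strategies import OUStrategy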
Example #2
def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trail.
    :param log_dir: Log dir path.
    :return:
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        # Set up params for ddpg
        action_noise = OUStrategy(env, sigma=params["sigma"])

        actor_net = ContinuousMLPPolicy(
            env_spec=env,
            name="Actor",
            hidden_sizes=params["actor_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        critic_net = ContinuousMLPQFunction(
            env_spec=env,
            name="Critic",
            hidden_sizes=params["critic_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu)

        ddpg = DDPG(env,
                    actor=actor_net,
                    critic=critic_net,
                    actor_lr=params["actor_lr"],
                    critic_lr=params["critic_lr"],
                    plot=False,
                    target_update_tau=params["tau"],
                    n_epochs=params["n_epochs"],
                    n_epoch_cycles=params["n_epoch_cycles"],
                    n_rollout_steps=params["n_rollout_steps"],
                    n_train_steps=params["n_train_steps"],
                    discount=params["discount"],
                    replay_buffer_size=params["replay_buffer_size"],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    actor_optimizer=tf.train.AdamOptimizer,
                    critic_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        tensorboard_log_dir = osp.join(log_dir, "progress")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(tensorboard_log_dir)

        ddpg.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
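run_garage reads its hyperparameters from a module-level params dict that the snippet does not show. A plausible sketch is below; the keys are taken from the lookups in the function above, while the values are purely illustrative assumptions.

# Hypothetical params dict consumed by run_garage above. The keys mirror the
# lookups in the function; the values are illustrative, not benchmarked ones.
params = {
    "sigma": 0.2,
    "actor_hidden_sizes": [64, 64],
    "critic_hidden_sizes": [64, 64],
    "actor_lr": 1e-4,
    "critic_lr": 1e-3,
    "tau": 1e-2,
    "n_epochs": 500,
    "n_epoch_cycles": 20,
    "n_rollout_steps": 100,
    "n_train_steps": 50,
    "discount": 0.9,
    "replay_buffer_size": int(1e6),
}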
Example #3
def run_task(*_):
    """
    Wrap DDPG training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(gym.make('FetchReach-v1'))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = ContinuousMLPPolicy(
        env_spec=env.spec,
        name="Policy",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
        input_include_goal=True,
    )

    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        name="QFunction",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        input_include_goal=True,
    )

    replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                    size_in_transitions=int(1e6),
                                    time_horizon=100,
                                    replay_k=0.4,
                                    reward_fun=env.compute_reward)

    ddpg = DDPG(
        env,
        policy=policy,
        policy_lr=1e-3,
        qf_lr=1e-3,
        qf=qf,
        replay_buffer=replay_buffer,
        plot=False,
        target_update_tau=0.05,
        n_epochs=50,
        n_epoch_cycles=20,
        max_path_length=100,
        n_train_steps=40,
        discount=0.9,
        exploration_strategy=action_noise,
        policy_optimizer=tf.train.AdamOptimizer,
        qf_optimizer=tf.train.AdamOptimizer,
        buffer_batch_size=256,
        input_include_goal=True,
    )

    ddpg.train()
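Functions named run_task like the one above are normally handed to garage's run_experiment launcher rather than called directly. A minimal launch sketch follows; the import path and keyword arguments are assumptions and vary across garage versions.

# Minimal launch sketch for the run_task defined above. Treat the import path
# and the keyword arguments as assumptions; they differ between garage versions.
from garage.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode="last",  # keep only the most recent snapshot
    seed=1,
)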
Example #4
def run_task(*_):
    """
    Wrap DDPG training task in the run_task function.

    :param _:
    :return:
    """
    env = gym.make('FetchReach-v1')

    action_noise = OUStrategy(env, sigma=0.2)

    actor_net = ContinuousMLPPolicy(
        env_spec=env,
        name="Actor",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
        input_include_goal=True,
    )

    critic_net = ContinuousMLPQFunction(
        env_spec=env,
        name="Critic",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        input_include_goal=True,
    )

    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-3,
        critic_lr=1e-3,
        critic=critic_net,
        plot=False,
        target_update_tau=0.05,
        n_epochs=50,
        n_epoch_cycles=20,
        n_rollout_steps=100,
        n_train_steps=40,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e4),
        exploration_strategy=action_noise,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer,
        use_her=True,
        batch_size=256,
        clip_obs=200.,
    )

    ddpg.train()
Example #5
def run_task(*_):

    sess = tf.Session()
    sess.__enter__()

    inner_env = SimpleReacherEnv(
        goal_position=(0.5, 0, 0.15),
        control_method="position_control",
        completion_bonus=2.,
        action_scale=0.04,
    )
    latent_policy = joblib.load(latent_policy_pkl)["policy"]

    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

    action_noise = OUStrategy(env, sigma=0.2)

    actor_net = ContinuousMLPPolicy(
        env_spec=env.spec,
        name="Actor",
        hidden_sizes=[64, 32],
        hidden_nonlinearity=tf.nn.relu)

    critic_net = ContinuousMLPQFunction(
        env_spec=env,
        name="Critic",
        hidden_sizes=[64, 32],
        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-4,
        critic_lr=1e-3,
        critic=critic_net,
        plot=True,
        target_update_tau=1e-2,
        n_epochs=500,
        n_epoch_cycles=10,
        n_rollout_steps=100,
        n_train_steps=50,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e3),
        exploration_strategy=action_noise,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer)
    ddpg.train(sess=sess)
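Example #5 enters the TensorFlow session by hand with sess.__enter__() and never exits it. The same training call can be wrapped in a context manager so the session is installed as the default and closed automatically; a small stylistic sketch, not part of the original example:

# Equivalent session handling with a context manager; tf.Session supports the
# with-statement and becomes the default session inside the block.
with tf.Session() as sess:
    ddpg.train(sess=sess)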
Example #6
def run_task(*_):
    """
    Wrap DDPG training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = ContinuousMLPPolicy(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=tf.nn.relu,
                                 output_nonlinearity=tf.nn.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=tf.nn.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    ddpg = DDPG(env,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                plot=False,
                target_update_tau=1e-2,
                n_epochs=500,
                n_epoch_cycles=20,
                max_path_length=100,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                policy_optimizer=tf.train.AdamOptimizer,
                qf_optimizer=tf.train.AdamOptimizer)

    ddpg.train()
Example #7
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    latent_policy = joblib.load(latent_policy_pkl)["policy"]

    inner_env = SequencePointEnv(completion_bonus=100)
    env = TfEnv(AlmostContinuousEmbeddedPolicyEnv(inner_env, latent_policy))

    action_noise = OUStrategy(env, sigma=0.8)
    
    actor_net = ContinuousMLPPolicy(
        env_spec=env,
        name="Actor",
        hidden_sizes=[64, 64],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh)

    critic_net = ContinuousMLPQFunction(
        env_spec=env,
        name="Critic",
        hidden_sizes=[64, 64],
        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-4,
        critic_lr=1e-3,
        critic=critic_net,
        plot=False,
        target_update_tau=1e-2,
        n_epochs=500,
        n_epoch_cycles=100,
        n_rollout_steps=50,
        n_train_steps=50,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e4),
        exploration_strategy=action_noise,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer)

    ddpg.train(sess=sess)
Example #8
def run_task(*_):
    """
    Wrap DDPG training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

    action_noise = OUStrategy(env, sigma=0.2)

    actor_net = ContinuousMLPPolicy(env_spec=env.spec,
                                    name="Actor",
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu,
                                    output_nonlinearity=tf.nn.tanh)

    critic_net = ContinuousMLPQFunction(env_spec=env.spec,
                                        name="Critic",
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(env,
                actor=actor_net,
                actor_lr=1e-4,
                critic_lr=1e-3,
                critic=critic_net,
                plot=False,
                target_update_tau=1e-2,
                n_epochs=500,
                n_epoch_cycles=20,
                n_rollout_steps=100,
                n_train_steps=50,
                discount=0.9,
                replay_buffer_size=int(1e6),
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                actor_optimizer=tf.train.AdamOptimizer,
                critic_optimizer=tf.train.AdamOptimizer)

    ddpg.train()
Example #9
def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trail.
    :param log_dir: Log dir path.
    :return:
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        env = TfEnv(env)
        # Set up params for ddpg
        action_noise = OUStrategy(env.spec, sigma=params["sigma"])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name="Policy",
            hidden_sizes=params["policy_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name="QFunction",
            hidden_sizes=params["qf_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params["replay_buffer_size"],
            time_horizon=params["n_rollout_steps"])

        ddpg = DDPG(
            env,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params["policy_lr"],
            qf_lr=params["qf_lr"],
            plot=False,
            target_update_tau=params["tau"],
            n_epochs=params["n_epochs"],
            n_epoch_cycles=params["n_epoch_cycles"],
            max_path_length=params["n_rollout_steps"],
            n_train_steps=params["n_train_steps"],
            discount=params["discount"],
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        tensorboard_log_dir = osp.join(log_dir, "progress")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(tensorboard_log_dir)

        ddpg.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
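run_garage returns the path of the progress.csv it wrote, so a caller can post-process the run. A minimal sketch of reading it back with the standard library follows; the environment, the log directory, and the "AverageReturn" column name are assumptions about what the logger records.

import csv
import os

import gym

# Hypothetical caller of run_garage above; the column name "AverageReturn"
# is an assumption about what the DDPG logger records.
os.makedirs("data/ddpg", exist_ok=True)
log_file = run_garage(gym.make("InvertedDoublePendulum-v2"),
                      seed=1,
                      log_dir="data/ddpg")
with open(log_file) as f:
    rows = list(csv.DictReader(f))
print("epochs logged:", len(rows))
if rows:
    print("last AverageReturn:", rows[-1].get("AverageReturn"))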