Example 1
def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Set plot=True here and in the experiment launcher to enable live
        # plotting during training.
        plot=True,
    )
    algo.train()
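These run_task functions are not called directly; they are handed to the experiment launcher, which sets up logging and sampling workers before invoking them. A minimal launch sketch for Example 1, assuming garage's run_experiment helper (run_experiment_lite in older rllab releases) and its usual n_parallel/snapshot_mode/seed/plot keyword arguments; the exact import path varies between versions:

from garage.misc.instrument import run_experiment  # older releases: rllab.misc.instrument.run_experiment_lite

run_experiment(
    run_task,
    # Number of parallel workers used for sampling.
    n_parallel=1,
    # Keep only the snapshot of the last iteration.
    snapshot_mode="last",
    # Fix the experiment seed for reproducibility.
    seed=1,
    # Must be enabled together with plot=True inside run_task for live plotting.
    plot=True,
)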
Example 2
def run_task(*_):
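    # TASK_ARGS and TASK_KWARGS are assumed to be module-level sequences that
    # parameterize the PR2ArmClockEnv task variants wrapped by the one-hot
    # multi-task environment.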
    env = normalize(
        OneHotMultiTaskEnv(
            task_env_cls=PR2ArmClockEnv,
            task_args=TASK_ARGS,
            task_kwargs=TASK_KWARGS))

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=4000,
        min_pool_size=10000,
        n_epochs=1000000000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Set plot=True here and in the experiment launcher to enable live plotting
        plot=True,
    )
    algo.train()
Example 3
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name="Policy",
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name="QFunction",
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

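        # Hindsight Experience Replay buffer: stored transitions are relabeled
        # with goals actually achieved along the trajectory, replay_k controls
        # the ratio of relabeled to original transitions, and reward_fun
        # recomputes rewards for the substituted goals.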
        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=int(1e6),
            time_horizon=100,
            replay_k=0.4,
            reward_fun=env.compute_reward)

        ddpg = DDPG(
            env,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            plot=False,
            target_update_tau=0.05,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, n_epoch_cycles=20)
Example 4
    def run_task(*_):

        sess = tf.Session()
        sess.__enter__()
        with LocalRunner(sess=sess) as runner:
            inner_env = SimpleReacherEnv(
                goal_position=(0.5, 0, 0.15),
                control_method="position_control",
                completion_bonus=2.,
                action_scale=0.04,
            )
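            # Load a pre-trained latent-skill policy from a snapshot pickle;
            # latent_policy_pkl is assumed to be a path defined by the
            # surrounding launcher script.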
            latent_policy = joblib.load(latent_policy_pkl)["policy"]

            env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

            action_noise = OUStrategy(env, sigma=0.2)

            policy = ContinuousMLPPolicy(
                env_spec=env.spec,
                name="Actor",
                hidden_sizes=[64, 32],
                hidden_nonlinearity=tf.nn.relu,
            )

            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        name="Critic",
                                        hidden_sizes=[64, 32],
                                        hidden_nonlinearity=tf.nn.relu)

            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)

            algo = DDPG(env,
                        policy=policy,
                        policy_lr=1e-4,
                        qf_lr=1e-3,
                        qf=qf,
                        plot=True,
                        target_update_tau=1e-2,
                        n_epochs=500,
                        n_train_steps=50,
                        discount=0.9,
                        replay_buffer=replay_buffer,
                        min_buffer_size=int(1e3),
                        exploration_strategy=action_noise,
                        policy_optimizer=tf.train.AdamOptimizer,
                        qf_optimizer=tf.train.AdamOptimizer)
            runner.setup(algo, env)
            runner.train(n_epochs=500, plot=False, n_epoch_cycles=10)
Example 5
def run_task(*_):
    with LocalRunner() as runner:
        env = SimpleReacherEnv(
            goal_position=(0.5, 0, 0.15),
            control_method="position_control",
            completion_bonus=2.,
            # action_scale=0.04,
        )

        env = TfEnv(env)

        action_noise = OUStrategy(env, sigma=0.05)

        actor_net = ContinuousMLPPolicy(
            env_spec=env.spec,
            name="Actor",
            hidden_sizes=[200, 100],
            hidden_nonlinearity=tf.nn.relu,)

        critic_net = ContinuousMLPQFunction(
            env_spec=env.spec,
            name="Critic",
            hidden_sizes=[200, 100],
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=100)

        ddpg = DDPG(
            env,
            policy=actor_net,
            policy_lr=1e-4,
            qf=critic_net,
            qf_lr=1e-3,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            max_path_length=200,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer)

        runner.setup(ddpg, env)
        runner.train(n_epochs=500, n_epoch_cycles=10, plot=False)
Example 6
    def test_ddpg(self):
        env = TheanoEnv(CartpoleEnv())
        policy = DeterministicMLPPolicy(env.spec)
        qf = ContinuousMLPQFunction(env.spec)
        es = OUStrategy(env.spec)
        algo = DDPG(
            env=env,
            policy=policy,
            qf=qf,
            es=es,
            n_epochs=1,
            epoch_length=100,
            batch_size=32,
            min_pool_size=50,
            replay_pool_size=1000,
            eval_samples=100,
        )
        algo.train()
Example 7
def run_task(*_):
    """
    Wrap the DDPG training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = ContinuousMLPPolicy(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=tf.nn.relu,
                                 output_nonlinearity=tf.nn.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=tf.nn.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    ddpg = DDPG(env,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                plot=False,
                target_update_tau=1e-2,
                n_epochs=500,
                n_epoch_cycles=20,
                max_path_length=100,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                policy_optimizer=tf.train.AdamOptimizer,
                qf_optimizer=tf.train.AdamOptimizer)

    ddpg.train()
Example 8
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    latent_policy = joblib.load(latent_policy_pkl)["policy"]

    inner_env = SequencePointEnv(completion_bonus=100)
    env = TfEnv(AlmostContinuousEmbeddedPolicyEnv(inner_env, latent_policy))

    action_noise = OUStrategy(env, sigma=0.8)

    actor_net = ContinuousMLPPolicy(env_spec=env.spec,
                                    name="Actor",
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu,
                                    output_nonlinearity=tf.nn.tanh)

    critic_net = ContinuousMLPQFunction(env_spec=env.spec,
                                        name="Critic",
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-4,
        critic_lr=1e-3,
        critic=critic_net,
        plot=False,
        target_update_tau=1e-2,
        n_epochs=500,
        n_epoch_cycles=100,
        n_rollout_steps=50,
        n_train_steps=50,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e4),
        exploration_strategy=action_noise,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer,
    )

    ddpg.train(sess=sess)
Example 9
    def test_ddpg(self):
        env = TheanoEnv(CartpoleEnv())
        policy = DeterministicMLPPolicy(env.spec)
        qf = ContinuousMLPQFunction(env.spec)
        es = OUStrategy(env.spec)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1000),
                                           time_horizon=100)
        algo = DDPG(
            env=env,
            policy=policy,
            qf=qf,
            es=es,
            pool=replay_buffer,
            n_epochs=1,
            epoch_length=100,
            batch_size=32,
            min_pool_size=50,
            eval_samples=100,
        )
        algo.train()
Example 10
    def test_ddpg_pendulum(self):
        """Test PPO with Pendulum environment."""
        logger.reset()
        with LocalRunner(self.sess) as runner:
            env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
            action_noise = OUStrategy(env.spec, sigma=0.2)
            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)
            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)
            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)
            algo = DDPG(
                env,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                target_update_tau=1e-2,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10,
                                        n_epoch_cycles=20,
                                        batch_size=100)
            assert last_avg_ret > 60

            env.close()
Example 11
def run_garage(env, seed, log_dir):
    """
    Create the garage model and run training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    """
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        env = TfEnv(env)
        # Set up params for ddpg
        action_noise = OUStrategy(env.spec, sigma=params["sigma"])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params["policy_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=params["qf_hidden_sizes"],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params["replay_buffer_size"],
            time_horizon=params["n_rollout_steps"])

        ddpg = DDPG(env,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    policy_lr=params["policy_lr"],
                    qf_lr=params["qf_lr"],
                    target_update_tau=params["tau"],
                    n_train_steps=params["n_train_steps"],
                    discount=params["discount"],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        tensorboard_log_dir = osp.join(log_dir)
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(tensorboard_log_dir)

        runner.setup(ddpg, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params["n_rollout_steps"])

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
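The returned path points at the tabular logger's progress.csv, which makes runs easy to compare afterwards. A minimal usage sketch, assuming a Gym task and pandas for reading the log (both are illustrative choices, not part of the snippet above):

import gym
import pandas as pd

env = gym.make("InvertedDoublePendulum-v2")
progress_csv = run_garage(env, seed=1, log_dir="/tmp/ddpg_benchmark")

# Inspect whatever columns the tabular logger recorded for this run.
progress = pd.read_csv(progress_csv)
print(progress.columns.tolist())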
Example 12
def run_garage(env, seed, log_dir):
    """
    Create the garage model and run training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        env = TfEnv(env)

        action_noise = OUStrategy(env.spec, sigma=params["sigma"])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name="Policy",
            hidden_sizes=params["policy_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name="QFunction",
            hidden_sizes=params["qf_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params["replay_buffer_size"],
            time_horizon=params["n_rollout_steps"],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params["policy_lr"],
            qf_lr=params["qf_lr"],
            plot=False,
            target_update_tau=params["tau"],
            n_epochs=params["n_epochs"],
            n_epoch_cycles=params["n_epoch_cycles"],
            max_path_length=params["n_rollout_steps"],
            n_train_steps=params["n_train_steps"],
            discount=params["discount"],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(log_dir)

        algo.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file