Example 1
def run_task(v):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # plot=True enables live plotting of a sample rollout during training
        plot=True,
    )
    algo.train()
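The variant value v["step_size"] above is supplied by whatever launches run_task. A minimal launcher sketch, assuming the rllab-style run_experiment_lite helper and an illustrative step size (the exact import path and launcher name differ between rllab and later garage releases):

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    # one sampler worker, keep only the final snapshot
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    # run_task receives this dict as its argument v
    variant=dict(step_size=0.01),
    plot=True,
)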
Example 2
    def test_dm_control_tf_policy(self):
        task = ALL_TASKS[0]

        env = TfEnv(DmControlEnv.from_suite(*task))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=10,
            max_path_length=5,
            n_itr=1,
            discount=0.99,
            step_size=0.01,
        )

        runner = LocalRunner(self.sess)
        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=10)

        env.close()
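ALL_TASKS is not defined in the snippet above; one plausible source (an assumption, since the test's imports are not shown) is dm_control's task registry, where every entry is a (domain_name, task_name) pair:

from dm_control import suite

# ALL_TASKS is a sequence of (domain_name, task_name) pairs, e.g. ('acrobot', 'swingup').
ALL_TASKS = suite.ALL_TASKS
task = ALL_TASKS[0]  # DmControlEnv.from_suite(*task) unpacks the pair into its two names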
Example 3
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(env_name="CartPole-v1")

        policy = CategoricalLSTMPolicy(
            name="policy",
            env_spec=env.spec,
            lstm_layer_cls=L.TfBasicLSTMLayer,
            # gru_layer_cls=L.GRULayer,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer=ConjugateGradientOptimizer,
                    optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                        base_eps=1e-5)))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
Example 4
def run_task(*_):
    env_name = "HumanoidStandup-v2"
    hidden_sizes = (100, 50, 25)
    env = TheanoEnv(normalize(gym.make(env_name)))
    print(env.spec.observation_space, env.spec.action_space)
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
    backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = CAPG(
        env=env,
        policy=policy,
        backup_policy=backup_policy,
        mix_policy=mix_policy,
        pos_eps_policy=pos_eps_policy,
        neg_eps_policy=neg_eps_policy,
        n_timestep=5e6,
        learning_rate=0.05,
        batch_size=5000,
        minibatch_size=500,
        n_sub_itr=10,
        baseline=baseline,
        max_path_length=500,
        discount=0.99,
        decay_learing_rate=True,
        log_dir='./logs/' + env_name,
    )
    algo.train()
Example 5
def run_task(v):
    """
    We wrap the main training loop in the run_task function so that
    run_experiment can easily execute variants of the experiment on different
    machines.
    """
    env = TfEnv(env_name="CartPole-v1")

    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # plot=True enables live plotting of a sample rollout during training
        plot=True,
    )
    algo.train()
Example 6
def run_task(*_):
    """Run task function."""
    initial_goal = np.array([0.6, -0.1, 0.30])

    rospy.init_node('trpo_real_sawyer_reacher_exp', anonymous=True)

    env = TheanoEnv(
        ReacherEnv(
            initial_goal,
            initial_joint_pos=INITIAL_ROBOT_JOINT_POS,
            simulated=False,
            robot_control_mode='position'))

    rospy.on_shutdown(env.shutdown)

    env.initialize()

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=False,
        force_batch_sampler=True,
    )
    algo.train()
Example 7
    def test_dm_control_tf_policy(self):
        task = ALL_TASKS[0]

        with self.graph.as_default():
            env = TfEnv(DmControlEnv.from_suite(*task))

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=10,
                max_path_length=5,
                n_itr=1,
                discount=0.99,
                step_size=0.01,
            )
            algo.train()
            env.close()
Example 8
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(normalize(PointEnv(goal=(-1, 0))))

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        batch_size = 4000
        max_path_length = 100
        n_envs = batch_size // max_path_length
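        # Note: n_envs is computed but not used in this snippet; presumably it would be
        # forwarded to the sampler, e.g. runner.setup(algo, env, sampler_args=dict(n_envs=n_envs)),
        # as the BatchSampler example further below does (an assumption about the intent).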

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size, plot=False)
Example 9
def run_task(*_):
    """Run task function."""
    initial_goal = np.array([0.6, -0.1, 0.40])

    # Initialize moveit_commander
    moveit_commander.roscpp_initialize(sys.argv)

    rospy.init_node('trpo_sim_sawyer_reacher_exp', anonymous=True)

    env = ReacherEnv(initial_goal,
                     initial_joint_pos=INITIAL_ROBOT_JOINT_POS,
                     simulated=True)

    rospy.on_shutdown(env.shutdown)

    env.initialize()

    policy = GaussianMLPPolicy(env_spec=spec(env), hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=spec(env))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=False,
        force_batch_sampler=True,
    )
    algo.train()
Example 10
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(
            normalize(
                OneHotMultiTaskEnv(task_env_cls=PointEnv,
                                   task_args=TASK_ARGS,
                                   task_kwargs=TASK_KWARGS)))

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
Example 11
    def test_reps_cartpole(self):
        """Test REPS with gym Cartpole environment."""
        with LocalRunner(self.sess) as runner:
            logger.reset()
            env = TfEnv(gym.make("CartPole-v0"))

            policy = CategoricalMLPPolicy(env_spec=env.spec,
                                          hidden_sizes=[32, 32])

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = REPS(env=env,
                        policy=policy,
                        baseline=baseline,
                        batch_size=4000,
                        max_path_length=100,
                        n_itr=10,
                        discount=0.99,
                        max_kl_step=1e6)

            runner.setup(algo, env)

            last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
            assert last_avg_ret > 5

            env.close()
Example 12
    def test_vpg_cartpole(self):
        """Test VPG with CartPole-v1 environment."""
        with LocalRunner(self.sess) as runner:
            env = TfEnv(env_name="CartPole-v1")

            policy = CategoricalMLPPolicy(
                name="policy", env_spec=env.spec, hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = VPG(
                env=env,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                optimizer_args=dict(
                    tf_optimizer_args=dict(learning_rate=0.01, )))

            runner.setup(algo, env)

            last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
            assert last_avg_ret > 90

            env.close()
Example 13
def run_task(*_):

    env = TfEnv(
        normalize(
            MinibotEnv(
                use_maps=[0, 1],  # alternative value: 'all'
                discretized=True)))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=5000,
                max_path_length=100,
                n_itr=15,
                discount=0.99,
                step_size=0.01,
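                # 'plot' is assumed to be a flag defined elsewhere in the original
                # script (e.g. a module-level plot = False); it is not set in this snippet.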
                plot=plot,
                pause_for_plot=False)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as session:
        algo.train(sess=session)
Example 14
def run_task(vv):
    with LocalRunner() as runner:
        env = TfEnv(normalize(gym.make('HalfCheetah-v1')))

        policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=(32, 32),
                                   name="policy")

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            step_size=vv["step_size"],
        )

        runner.setup(algo=algo, env=env)

        runner.train(
            n_epochs=40,
            batch_size=4000,
            # Uncomment to enable plotting
            # plot=True
        )
Example 15
def run_task(*_):
    env = TfEnv(
        normalize(
            OneHotMultiTaskEnv(task_env_cls=PR2ArmClockEnv,
                               task_args=TASK_ARGS,
                               task_kwargs=TASK_KWARGS)))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=400000000,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
Example 16
    def test_batch_sampler(self):
        max_cpus = 8
        with LocalRunner(max_cpus=max_cpus) as runner:
            env = TfEnv(env_name='CartPole-v1')

            policy = CategoricalMLPPolicy(name="policy",
                                          env_spec=env.spec,
                                          hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = VPG(env=env,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=1,
                       whole_paths=True,
                       discount=0.99)

            runner.setup(algo,
                         env,
                         sampler_cls=BatchSampler,
                         sampler_args={'n_envs': max_cpus})

            try:
                runner.initialize_tf_vars()
            except BaseException:
                raise self.failureException(
                    "LocalRunner should be able to initialize tf variables.")

            runner.start_worker()

            paths = runner.sampler.obtain_samples(0, 8)
            self.assertGreaterEqual(
                len(paths), max_cpus, "BatchSampler should sample more than "
                "max_cpus=%d trajectories" % max_cpus)
Example 17
    def test_categorical_policies(self, policy_cls):
        with LocalRunner(self.sess) as runner:
            env = TfEnv(normalize(gym.make("CartPole-v0")))

            policy = policy_cls(name="policy", env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                step_size=0.01,
                plot=True,
                optimizer=ConjugateGradientOptimizer,
                optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                    base_eps=1e-5)),
            )

            runner.setup(algo, env)
            runner.train(n_epochs=1, batch_size=4000)

            env.close()
Example 18
def run_task(v):
    """
    We wrap the main training loop in the run_task function so that
    run_experiment can easily execute variants of the experiment on different
    machines.
    """
    with LocalRunner() as runner:
        env = TfEnv(env_name="CartPole-v1")

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            # The neural network policy should have two hidden layers,
            # each with 32 hidden units.
            hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            step_size=v["step_size"],
        )

        runner.setup(algo=algo, env=env)

        runner.train(
            n_epochs=40,
            batch_size=4000,
            # Uncomment to enable plotting
            # plot=True
        )
Example 19
def run_task(*_):
    env = normalize(
        DmControlEnv(
            domain_name='cartpole', task_name='balance',
            visualize_reward=True))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=400,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
Example 20
def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example with a Discrete action space, a
    # CategoricalMLPPolicy works, but for a Box action space may need to use
    # a GaussianMLPPolicy (see the trpo_gym_pendulum.py example)
    env = TheanoEnv(normalize(gym.make("CartPole-v0")))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # plot=True enables live plotting of a sample rollout during training
        plot=True,
    )
    algo.train()
Example 21
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = DummyPolicy(env_spec=env)

    baseline = LinearFeatureBaseline(env_spec=env)
    algo = InstrumentedNOP(env=env,
                           policy=policy,
                           baseline=baseline,
                           batch_size=4000,
                           max_path_length=100,
                           n_itr=4,
                           discount=0.99,
                           step_size=0.01,
                           plot=True)
    algo.train()
Example 22
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(gym.make("CartPole-v0"))

        policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=[32, 32])

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = REPS(env=env,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000, plot=False)
Example 23
def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = InstrumentedTRPO(env=env,
                            policy=policy,
                            baseline=baseline,
                            batch_size=4000,
                            max_path_length=100,
                            n_itr=4,
                            discount=0.99,
                            step_size=0.01,
                            plot=True)
    algo.train()
Example 24
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=500,
            discount=0.99,
            step_size=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=4000)
Example 25
def run_task(*_):
    """Wrap REPS training task in the run_task function."""
    env = TfEnv(gym.make("CartPole-v0"))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=[32, 32])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = REPS(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=100,
                n_itr=100,
                discount=0.99,
                plot=False)

    algo.train()
Example 26
def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianGRUPolicy(env_spec=env.spec, )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
    algo.train()
Example 27
def run_task(*_):
    """Wrap ERWR training task in the run_task function."""
    env = TfEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ERWR(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=10000,
                max_path_length=100,
                n_itr=40,
                discount=0.99)
    algo.train()
Example 28
def run_pick_and_place(*_):
    initial_goal = np.array([0.6, -0.1, 0.80])
    env = TheanoEnv(PickAndPlaceEnv(initial_goal))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=2000,
        baseline=baseline,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()
Example 29
def run_task(*_):
    """Wrap ERWR training task in the run_task function."""
    env = TfEnv(env_name="CartPole-v1")

    policy = CategoricalMLPPolicy(name="policy",
                                  env_spec=env.spec,
                                  hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = ERWR(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=10000,
                max_path_length=100,
                n_itr=100,
                plot=True,
                discount=0.99)
    algo.train()
Example 30
def run(*_):
    """Stub method for running trpo."""
    env = TheanoEnv(
        ReacherEnv(control_method='position_control', sparse_reward=False))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=100,
        baseline=baseline,
        n_itr=2500,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()