def shane_ddpg_launcher(variant):
	from rllab.exploration_strategies.gaussian_strategy import GaussianStrategy
	from sandbox.rocky.tf.algos.ddpg import DDPG as ShaneDDPG
	from sandbox.rocky.tf.envs.base import TfEnv
	from sandbox.rocky.tf.policies.deterministic_mlp_policy import (
		DeterministicMLPPolicy
	)
	from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import (
		ContinuousMLPQFunction
	)
	from railrl.launchers.launcher_util import get_env_settings
	env_settings = get_env_settings(**variant['env_params'])
	env = TfEnv(env_settings['env'])
	es = GaussianStrategy(env.spec)

	policy = DeterministicMLPPolicy(
		name="init_policy",
		env_spec=env.spec,
		**variant['policy_params']
	)
	qf = ContinuousMLPQFunction(
		name="qf",
		env_spec=env.spec,
		**variant['qf_params']
	)

	algorithm = ShaneDDPG(
		env,
		policy,
		qf,
		es,
		**variant['algo_params']
	)
	algorithm.train()
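
# For context, shane_ddpg_launcher is driven by a nested variant dict whose
# sub-dicts are unpacked into the policy, Q-function, and algorithm
# constructors. The sketch below is illustrative only: the contents of
# 'env_params' depend on get_env_settings (not shown here) and are a
# placeholder, while the other keys mirror keyword arguments used in the
# examples further down this page.
example_variant = dict(
    env_params=dict(
        # ... whatever get_env_settings(**env_params) expects (assumption)
    ),
    policy_params=dict(
        hidden_sizes=(100, 100),
    ),
    qf_params=dict(
        hidden_sizes=(100, 100),
    ),
    algo_params=dict(
        batch_size=64,
        n_epochs=100,
        epoch_length=1000,
        max_path_length=1000,
        discount=0.99,
    ),
)
# shane_ddpg_launcher(example_variant)
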
def main():
    # stub(globals()) replaces the classes in this module with stubs, so calls
    # such as algorithm.train() below are recorded and executed later inside
    # run_experiment_lite rather than run immediately.
    stub(globals())

    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = GaussianStrategy(env.spec)
        policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        qf = ContinuousMLPQFunction(name="qf",
                                    env_spec=env.spec,
                                    hidden_sizes=(100, 100))
        ddpg_params = dict(
            batch_size=4,
            n_epochs=100,
            epoch_length=50,
            eval_samples=50,
            max_path_length=10,
            min_pool_size=5,
        )
        algorithm = DDPG(env, policy, qf, es, **ddpg_params)

        for _ in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="check-rllab-ddpg-seed",
                seed=seed,
                variant={"seed": seed},
            )
Example #3
def main():
    stub(globals())
    env = TfEnv(HalfCheetahEnv())
    for seed in range(3):
        ddpg_params = dict(
            batch_size=128,
            n_epochs=100,
            epoch_length=10000,
            eval_samples=10000,
            discount=0.99,
            policy_learning_rate=1e-4,
            qf_learning_rate=1e-3,
            soft_target_tau=0.01,
            replay_pool_size=1000000,
            min_pool_size=256,
            scale_reward=1.0,
            max_path_length=1000,
            qf_weight_decay=0.0,
        )
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        vitchyr_policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        vitchyr_ddpg = DDPG(env, vitchyr_es, vitchyr_policy, vitchyr_qf,
                            **ddpg_params)

        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(name="qf",
                                          env_spec=env.spec,
                                          hidden_sizes=(100, 100))
        shane_ddpg = ShaneDDPG(env, shane_policy, shane_qf, shane_es,
                               **ddpg_params)

        names_and_algos = [
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        ]
        for name, algorithm in names_and_algos:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
Example #4
def get_qf(env, algo_name, qf_hid_size, qf_hidden_nonlinearity, **kwargs):
    """Return a critic for algorithms that need one, or None otherwise."""
    qf = None
    if algo_name in ['ddpg', 'qprop', 'qvpg']:
        if qf_hidden_nonlinearity == 'relu':
            hidden_nonlinearity = tf.nn.relu
        elif qf_hidden_nonlinearity == 'tanh':
            hidden_nonlinearity = tf.nn.tanh
        else:
            raise NotImplementedError(qf_hidden_nonlinearity)
        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=(qf_hid_size, qf_hid_size),
            hidden_nonlinearity=hidden_nonlinearity,
        )
    return qf
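
# A minimal usage sketch for get_qf, assuming the TfEnv/HalfCheetahEnv setup
# used elsewhere on this page; 'trpo' is included only to show the None branch
# for algorithms that do not use a critic.
env = TfEnv(HalfCheetahEnv())
qf = get_qf(env, 'ddpg', qf_hid_size=100, qf_hidden_nonlinearity='relu')
assert qf is not None
assert get_qf(env, 'trpo', qf_hid_size=100,
              qf_hidden_nonlinearity='relu') is None
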
def main():
    stub(globals())
    ddpg_params = dict(
        batch_size=64,
        n_epochs=2000,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        soft_target_tau=0.001,
        replay_pool_size=1000000,
        min_pool_size=1000,
        scale_reward=0.1,
    )
    env = TfEnv(HalfCheetahEnv())
    es = OUStrategy(env_spec=env.spec)

    policy = DeterministicMLPPolicy(
        name="init_policy",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )
    qf = ContinuousMLPQFunction(
        name="qf",
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        bn=False,
    )

    algorithm = DDPG(
        env,
        policy,
        qf,
        es,
        **ddpg_params
    )

    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix="ddpg-shane-half-cheetah-script",
        seed=1,
        variant=ddpg_params,
    )
Example #6
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.envs.normalized_env import normalize
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from sandbox.rocky.tf.baselines.q_baseline import QfunctionBaseline

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

qf = ContinuousMLPQFunction(env_spec=env.spec)

baseline = LinearFeatureBaseline(env_spec=env.spec)

qf_baseline = QfunctionBaseline(env_spec=env.spec,
    policy=policy, qf=qf)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    qf_baseline=qf_baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
Example #7
for r in range(learning_rate_size):

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        name="policy",
        # Three hidden layers with 100, 50, and 25 units.
        hidden_sizes=(100, 50, 25),
        hidden_nonlinearity=tf.nn.relu,
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(100, 50, 25),
        hidden_nonlinearity=tf.nn.relu,
    )

    for e in range(num_experiments):

        algo = ddpg_class(
            env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=32,
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,

for l in range(layer_size):

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        name="policy",
        # Two hidden layers whose sizes come from the layer_1/layer_2 sweep.
        hidden_sizes=(layer_1[l], layer_2[l]),
        hidden_nonlinearity=tf.nn.relu,
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(layer_1[l], layer_2[l]),
        hidden_nonlinearity=tf.nn.relu,
    )

    for e in range(num_experiments):

        algo = ddpg_class(
            env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=32,
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,
Example #9
def get_qf(env, info, algo_name, qf_hidden_sizes, qf_hidden_nonlinearity,
           **kwargs):
    """Instantiate the Q-function class matching the algorithm and action space."""
    qf = None
    qf_class = None
    hidden_sizes = get_hidden_sizes(qf_hidden_sizes)
    hidden_nonlinearity = get_nonlinearity(qf_hidden_nonlinearity)
    extra_kwargs = dict()
    if algo_name in [
            'ddpg',
            'trpg',
            'trpgoff',
            'qprop',
            'mqprop',
            'nuqprop',
            'nuqfqprop',
            'qfqprop',
            'actrpo',
            'acqftrpo',
            'qvpg',
            'dspg',
            'dspgoff',
    ]:
        if info['is_action_discrete']:
            qf = DiscreteMLPQFunction(
                env_spec=env.spec,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
            )
            qf_class = 'DiscreteMLPQFunction'
        else:
            if algo_name in [
                    'trpg',
                    'trpgoff',
                    'dspg',
                    'dspgoff',
                    'acqftrpo',
                    'qfqprop',
                    'nuqfqprop',
            ]:
                extra_kwargs['eqf_use_full_qf'] = True
            elif algo_name == 'mqprop':
                extra_kwargs['mqprop'] = True
            qf = ContinuousMLPQFunction(
                env_spec=env.spec,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                **extra_kwargs,
            )
            qf_class = 'ContinuousMLPQFunction'
    elif algo_name in [
            'nafqprop',
    ]:
        assert not info['is_action_discrete']
        qf = NAFMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
        )
        qf_class = 'NAFMLPQFunction'
    elif algo_name in [
            'dqn',
    ]:
        if info['is_action_discrete']:
            qf = DeterministicDiscreteMLPQFunction(
                env_spec=env.spec,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
            )
            qf_class = 'DeterministicDiscreteMLPQFunction'
        else:
            qf = DeterministicNAFMLPQFunction(
                env_spec=env.spec,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
            )
            qf_class = 'DeterministicNAFMLPQFunction'
    elif algo_name in ['dsqn']:
        assert info['is_action_discrete']
        qf = StochasticDiscreteMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
        )
        qf_class = 'StochasticDiscreteMLPQFunction'
    print('[get_qf] Instantiating %s, with sizes=%s, hidden_nonlinearity=%s.' %
          (qf_class, str(hidden_sizes), qf_hidden_nonlinearity))
    return qf
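
# A hedged usage sketch for this get_qf variant. Assumptions: env is a
# TfEnv-wrapped continuous-control task, the info dict only needs the
# 'is_action_discrete' flag that the function reads, and the helpers
# get_hidden_sizes/get_nonlinearity accept strings like '100,100' and 'relu'.
info = {'is_action_discrete': False}
qf = get_qf(env, info, algo_name='qprop',
            qf_hidden_sizes='100,100', qf_hidden_nonlinearity='relu')
# 'qprop' with a continuous action space instantiates a ContinuousMLPQFunction
# without any of the extra eqf_use_full_qf / mqprop flags.
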
Example #10
activation_map = {"relu": tf.nn.relu, "tanh": tf.nn.tanh, "leaky_relu": lrelu}

policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    name="policy",
    # Hidden layer sizes come from the parsed command-line arguments.
    hidden_sizes=args.policy_size,
    hidden_nonlinearity=activation_map[args.policy_activation],
)

es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_nonlinearity=activation_map[args.vf_activation],
    hidden_sizes=args.vf_size,
)

algo = DDPG(env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=128,
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,
            discount=0.995,
            scale_reward=args.reward_scale,
            qf_learning_rate=1e-3,