def shane_ddpg_launcher(variant):
    """Build the sandbox (TensorFlow) DDPG from ``variant`` and train it.

    ``variant`` must contain four keyword-argument dicts:

    * ``env_params``    -- forwarded to ``get_env_settings``
    * ``policy_params`` -- forwarded to ``DeterministicMLPPolicy``
    * ``qf_params``     -- forwarded to ``ContinuousMLPQFunction``
    * ``algo_params``   -- forwarded to the DDPG constructor

    ``train()`` is called on the resulting algorithm; nothing is returned.
    """
    # Imports are kept local to the function, in the same order as before.
    from rllab.exploration_strategies.gaussian_strategy import GaussianStrategy
    from sandbox.rocky.tf.algos.ddpg import DDPG as ShaneDDPG
    from sandbox.rocky.tf.envs.base import TfEnv
    from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy
    from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
    from railrl.launchers.launcher_util import get_env_settings

    # Only the 'env' entry of the settings dict is used here.
    wrapped_env = TfEnv(get_env_settings(**variant['env_params'])['env'])
    spec = wrapped_env.spec
    exploration = GaussianStrategy(spec)

    actor = DeterministicMLPPolicy(name="init_policy", env_spec=spec,
                                   **variant['policy_params'])
    critic = ContinuousMLPQFunction(name="qf", env_spec=spec,
                                    **variant['qf_params'])

    # Positional order expected by this DDPG: env, policy, qf, es.
    ShaneDDPG(wrapped_env, actor, critic, exploration,
              **variant['algo_params']).train()
def main():
    """Launch the rllab DDPG seed check on HalfCheetah.

    After ``stub(globals())``, a fresh env / exploration strategy / policy /
    Q-function / DDPG is built for each seed in ``range(3)``, and
    ``algorithm.train()`` is handed to ``run_experiment_lite`` three times
    per seed with identical settings.
    """
    stub(globals())

    # Identical for every seed, so built once up front.
    algo_kwargs = {
        "batch_size": 4,
        "n_epochs": 100,
        "epoch_length": 50,
        "eval_samples": 50,
        "max_path_length": 10,
        "min_pool_size": 5,
    }
    layer_sizes = (100, 100)

    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        exploration = GaussianStrategy(env.spec)
        actor = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=layer_sizes,
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        critic = ContinuousMLPQFunction(
            name="qf",
            env_spec=env.spec,
            hidden_sizes=layer_sizes,
        )
        algorithm = DDPG(env, actor, critic, exploration, **algo_kwargs)

        # Same algorithm and seed submitted three times in a row.
        for _repeat in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="check-rllab-ddpg-seed",
                seed=seed,
                variant={"seed": seed},
            )
Ejemplo n.º 3
0
def main():
    """Launch two DDPG implementations on HalfCheetah with shared settings.

    One ``TfEnv(HalfCheetahEnv())`` is created and reused. For each seed in
    ``range(3)`` both algorithms are built from the same hyperparameter dict,
    the env is reset, and each ``train()`` call is handed to
    ``run_experiment_lite``.
    """
    stub(globals())
    env = TfEnv(HalfCheetahEnv())

    # Hyperparameters shared by both implementations and by every seed.
    ddpg_params = {
        "batch_size": 128,
        "n_epochs": 100,
        "epoch_length": 10000,
        "eval_samples": 10000,
        "discount": 0.99,
        "policy_learning_rate": 1e-4,
        "qf_learning_rate": 1e-3,
        "soft_target_tau": 0.01,
        "replay_pool_size": 1000000,
        "min_pool_size": 256,
        "scale_reward": 1.0,
        "max_path_length": 1000,
        "qf_weight_decay": 0.0,
    }

    for seed in range(3):
        # First implementation: note its positional order (env, es, policy, qf).
        vitchyr_es = OUStrategy(env_spec=env.spec)
        vitchyr_qf = FeedForwardCritic(name_or_scope="critic", env_spec=env.spec)
        vitchyr_policy = FeedForwardPolicy(name_or_scope="actor", env_spec=env.spec)
        vitchyr_ddpg = DDPG(
            env, vitchyr_es, vitchyr_policy, vitchyr_qf, **ddpg_params
        )

        # Second implementation: positional order is (env, policy, qf, es).
        shane_es = GaussianStrategy(env.spec)
        shane_policy = DeterministicMLPPolicy(
            name="init_policy",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )
        shane_qf = ContinuousMLPQFunction(
            name="qf",
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )
        shane_ddpg = ShaneDDPG(
            env, shane_policy, shane_qf, shane_es, **ddpg_params
        )

        # The labels are informational only; they are not passed on.
        contenders = (
            ("Vitchyr_DDPG", vitchyr_ddpg),
            ("Shane_DDPG", shane_ddpg),
        )
        for _label, algorithm in contenders:
            env.reset()
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix="ddpg-comparison-cheetah",
                seed=seed,
            )
Ejemplo n.º 4
0
    def __init__(self, env, args):
        """Initialise sampling and logging, then build ``self.algo``.

        ``args`` supplies every setting read below (seed, algo, baseline type,
        log locations, hyperparameters). ``env`` is first passed through
        ``rllab_envpolicy_parser``, which returns the env/policy pair used
        from then on.

        ``self.algo`` is assigned only for ``args.algo`` equal to ``'tftrpo'``
        or ``'thddpg'``; any other value leaves it unset.

        Raises:
            NotImplementedError: unknown ``args.baseline_type`` (non-DDPG
                algorithms) or unknown ``args.exp_strategy`` (``'thddpg'``).
        """
        self.args = args

        # Parallel sampler setup, seeded only when a seed was given.
        parallel_sampler.initialize(n_parallel=args.n_parallel)
        seed = args.seed
        if seed is not None:
            set_seed(seed)
            parallel_sampler.set_seed(seed)

        env, policy = rllab_envpolicy_parser(env, args)

        # A baseline is built for every algorithm except 'thddpg'.
        if args.algo != 'thddpg':
            baseline_type = args.baseline_type
            if baseline_type not in ('linear', 'zero'):
                raise NotImplementedError(baseline_type)
            baseline_cls = (LinearFeatureBaseline
                            if baseline_type == 'linear' else ZeroBaseline)
            baseline = baseline_cls(env_spec=env.spec)

        # Logger: resolve the output directory, then the three log files.
        fallback_root = config.LOG_DIR
        log_dir = args.log_dir
        if log_dir is None:
            log_dir = osp.join(fallback_root, args.exp_name)

        tabular_path = osp.join(log_dir, args.tabular_log_file)
        text_path = osp.join(log_dir, args.text_log_file)
        params_path = osp.join(log_dir, args.params_log_file)

        logger.log_parameters_lite(params_path, args)
        logger.add_text_output(text_path)
        logger.add_tabular_output(tabular_path)
        # Previous snapshot settings are read before being overwritten; they
        # are not used again in the visible part of this method.
        prev_snapshot_dir = logger.get_snapshot_dir()
        prev_mode = logger.get_snapshot_mode()
        logger.set_snapshot_dir(log_dir)
        logger.set_snapshot_mode(args.snapshot_mode)
        logger.set_log_tabular_only(args.log_tabular_only)
        logger.push_prefix("[%s] " % args.exp_name)

        if args.algo == 'tftrpo':
            # A custom optimizer is supplied only for recurrent policies.
            optimizer = None
            if args.recurrent:
                optimizer = ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
            self.algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                n_itr=args.n_iter,
                discount=args.discount,
                gae_lambda=args.gae_lambda,
                step_size=args.step_size,
                optimizer=optimizer,
                mode=args.control,
            )
        elif args.algo == 'thddpg':
            qfunc = thContinuousMLPQFunction(env_spec=env.spec)

            strategy = args.exp_strategy
            if strategy == 'ou':
                es = OUStrategy(env_spec=env.spec)
            elif strategy == 'gauss':
                es = GaussianStrategy(env_spec=env.spec)
            else:
                raise NotImplementedError()

            ddpg_kwargs = dict(
                env=env,
                policy=policy,
                qf=qfunc,
                es=es,
                batch_size=args.batch_size,
                max_path_length=args.max_path_length,
                epoch_length=args.epoch_length,
                min_pool_size=args.min_pool_size,
                replay_pool_size=args.replay_pool_size,
                n_epochs=args.n_iter,
                discount=args.discount,
                scale_reward=0.01,  # fixed here, not read from args
                qf_learning_rate=args.qfunc_lr,
                policy_learning_rate=args.policy_lr,
                eval_samples=args.eval_samples,
                mode=args.control,
            )
            self.algo = thDDPG(**ddpg_kwargs)
Ejemplo n.º 5
0
    def setup(self, env, policy, start_itr):
        """Configure logging and build the algorithm named by ``self.args.algo``.

        Args:
            env: environment; ``env.spec`` is passed to the baseline,
                Q-function and exploration strategy, and ``env.agents`` is
                read when ``self.args.control == 'concurrent'``.
            policy: policy (or policies) handed to the algorithm constructor.
            start_itr: starting iteration; only forwarded to ``MATRPO``.

        Returns:
            A ``MATRPO`` instance for ``'tftrpo'`` or a ``thDDPG`` instance
            for ``'thddpg'``.

        Raises:
            NotImplementedError: unknown ``baseline_type``, ``exp_strategy``
                or ``algo``.
        """

        if not self.args.algo == 'thddpg':
            # Baseline (not needed for DDPG)
            if self.args.baseline_type == 'linear':
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            elif self.args.baseline_type == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            else:
                raise NotImplementedError(self.args.baseline_type)

            if self.args.control == 'concurrent':
                # NOTE(review): every list slot refers to the SAME baseline
                # object -- confirm that sharing one instance across agents
                # is intended.
                baseline = [baseline for _ in range(len(env.agents))]

        # Logger
        default_log_dir = config.LOG_DIR
        if self.args.log_dir is None:
            log_dir = osp.join(default_log_dir, self.args.exp_name)
        else:
            log_dir = self.args.log_dir

        tabular_log_file = osp.join(log_dir, self.args.tabular_log_file)
        text_log_file = osp.join(log_dir, self.args.text_log_file)
        params_log_file = osp.join(log_dir, self.args.params_log_file)

        logger.log_parameters_lite(params_log_file, self.args)
        logger.add_text_output(text_log_file)
        logger.add_tabular_output(tabular_log_file)
        # NOTE(review): the previous snapshot settings are read but never
        # restored in this method -- kept as-is; confirm whether they can go.
        prev_snapshot_dir = logger.get_snapshot_dir()
        prev_mode = logger.get_snapshot_mode()
        logger.set_snapshot_dir(log_dir)
        logger.set_snapshot_mode(self.args.snapshot_mode)
        logger.set_log_tabular_only(self.args.log_tabular_only)
        logger.push_prefix("[%s] " % self.args.exp_name)

        if self.args.algo == 'tftrpo':
            algo = MATRPO(
                env=env,
                policy_or_policies=policy,
                baseline_or_baselines=baseline,
                batch_size=self.args.batch_size,
                start_itr=start_itr,
                max_path_length=self.args.max_path_length,
                n_itr=self.args.n_iter,
                discount=self.args.discount,
                gae_lambda=self.args.gae_lambda,
                step_size=self.args.step_size,
                # A custom optimizer is supplied only for recurrent policies.
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(
                        base_eps=1e-5)) if self.args.recurrent else None,
                ma_mode=self.args.control)
        elif self.args.algo == 'thddpg':
            qfunc = thContinuousMLPQFunction(env_spec=env.spec)
            if self.args.exp_strategy == 'ou':
                es = OUStrategy(env_spec=env.spec)
            elif self.args.exp_strategy == 'gauss':
                es = GaussianStrategy(env_spec=env.spec)
            else:
                raise NotImplementedError()

            algo = thDDPG(env=env,
                          policy=policy,
                          qf=qfunc,
                          es=es,
                          batch_size=self.args.batch_size,
                          max_path_length=self.args.max_path_length,
                          epoch_length=self.args.epoch_length,
                          min_pool_size=self.args.min_pool_size,
                          replay_pool_size=self.args.replay_pool_size,
                          n_epochs=self.args.n_iter,
                          discount=self.args.discount,
                          scale_reward=0.01,  # fixed here, not read from args
                          qf_learning_rate=self.args.qfunc_lr,
                          policy_learning_rate=self.args.policy_lr,
                          eval_samples=self.args.eval_samples,
                          mode=self.args.control)
        else:
            # Previously an unknown algo fell through and `return algo`
            # raised UnboundLocalError; fail with the same exception type
            # used for the other unsupported options instead.
            raise NotImplementedError(self.args.algo)
        return algo
Ejemplo n.º 6
0
)

# Hyperparameter grid. Only `decay_period` currently has more than one value;
# the trailing comments list alternatives left commented out by the author.
vg = instrument.VariantGenerator()
vg.add("scale_reward", [0.01])  # , 0.001, 0.1])
vg.add("policy_learning_rate", [1e-4])  # , 1e-3, 1e-5])
vg.add("qf_learning_rate", [1e-3])  # , 1e-3, 1e-4])
vg.add("decay_period", [1E+6, 1E+5, 1E+4, 1E+3, 1E+7, 1E+8, 1E+9, 1E+10])

variants = vg.variants()

# The first command-line argument selects which variant to run.
# NOTE(review): eval() executes whatever expression is passed on the command
# line. It is kept for backward compatibility, but int(sys.argv[1]) would be
# safer if only plain indices are ever passed -- confirm with callers.
num = eval(sys.argv[1])

# %-formatting inside a single-argument print() emits the same text under
# Python 2 and Python 3; the old print statement was Python 2 only.
print("#Experiments number: %s" % (num,))
variant = variants[num]

# es = OUStrategy(env_spec=env.spec, theta=0.15, sigma=0.3)
es = GaussianStrategy(
    env_spec=env.spec,
    max_sigma=1.0,
    min_sigma=0.1,
    decay_period=variant["decay_period"],
)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    batch_size=35,
    max_path_length=100,
    epoch_length=5000,
    min_pool_size=10000,
    n_epochs=100,
    discount=0.99,
    scale_reward=variant["scale_reward"],
    soft_target_tau=1e-3,
    qf_learning_rate=variant["qf_learning_rate"],