Code example #1
    def test_target_params_copied(self):
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
        )
        target_qf = algo.target_qf
        target_policy = algo.target_policy
        qf = algo.qf
        qf_copy = algo.qf_with_action_input
        policy = algo.policy

        # Make sure they're different to start
        random_values = [
            np.random.rand(*values.shape) for values in qf.get_param_values()
        ]
        qf.set_param_values(random_values)
        random_values = [
            np.random.rand(*values.shape)
            for values in policy.get_param_values()
        ]
        policy.set_param_values(random_values)

        self.assertParamsNotEqual(target_qf, qf)
        self.assertParamsNotEqual(target_policy, policy)
        self.assertParamsEqual(qf_copy, qf)

        algo.train()
        self.assertParamsEqual(target_qf, qf)
        self.assertParamsEqual(target_policy, policy)
        self.assertParamsEqual(qf_copy, qf)
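The helpers assertParamsEqual and assertParamsNotEqual are not defined in this snippet. Below is a minimal NumPy sketch of how such a comparison could work, assuming get_param_values() returns a list of arrays in a fixed order (as the test above implies); params_equal is a hypothetical name, not the test class's actual helper:

import numpy as np

def params_equal(net_a, net_b):
    # Compare two networks' parameter lists elementwise.
    # Assumes get_param_values() returns NumPy arrays in matching order.
    vals_a = net_a.get_param_values()
    vals_b = net_b.get_param_values()
    if len(vals_a) != len(vals_b):
        return False
    return all(np.allclose(a, b) for a, b in zip(vals_a, vals_b))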
Code example #2
def lstm_launcher(variant):
    """
    Run a simple LSTM on an environment.

    :param variant: Dictionary of dictionaries with the following keys:
        - algo_params
        - env_params
        - qf_params
        - policy_params
    :return:
    """
    from railrl.algos.ddpg import DDPG as MyDDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from railrl.launchers.launcher_util import get_env_settings
    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(name_or_scope="critic",
                           env_spec=env.spec,
                           **variant.get('qf_params', {}))
    policy = FeedForwardPolicy(name_or_scope="actor",
                               env_spec=env.spec,
                               **variant.get('policy_params', {}))
    algorithm = MyDDPG(env, es, policy, qf, **variant['algo_params'])
    algorithm.train()
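For reference, a variant for this launcher might look like the sketch below. The algo_params, qf_params, and policy_params entries reuse argument names that appear in the other examples here; the contents of env_params depend on get_env_settings, whose signature is not shown, so the 'env_id' key is only an assumption.

# Hypothetical variant for lstm_launcher; env_params keys are assumptions.
variant = dict(
    algo_params=dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        min_pool_size=256,
    ),
    env_params=dict(
        env_id='cart',  # assumption: depends on get_env_settings
    ),
    qf_params=dict(
        embedded_hidden_sizes=(100,),
        observation_hidden_sizes=(100,),
    ),
    policy_params=dict(
        observation_hidden_sizes=(400, 300),
    ),
)
lstm_launcher(variant)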
Code example #3
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=128,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=100,
    )
    exp_prefix = 'ddpg-cartpole-speed-{0}'.format(timestamp())
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **default_ddpg_params,
    )

    run_experiment_lite(
        algorithm.train(),
        n_parallel=1,
        snapshot_mode="last",
        exp_prefix=exp_prefix,
        seed=1,
    )
Code example #4
def my_ddpg_launcher(variant):
	"""
	Run DDPG.

	:param variant: Dictionary of dictionaries with the following keys:
		- algo_params
		- env_params
		- qf_params
		- policy_params
	:return:
	"""
	from railrl.algos.ddpg import DDPG as MyDDPG
	from railrl.policies.nn_policy import FeedForwardPolicy
	from railrl.qfunctions.nn_qfunction import FeedForwardCritic
	from rllab.exploration_strategies.ou_strategy import OUStrategy
	from railrl.launchers.launcher_util import get_env_settings
	from railrl.core.tf_util import BatchNormConfig
	if ('batch_norm_params' in variant
		and variant['batch_norm_params'] is not None):
		bn_config = BatchNormConfig(**variant['batch_norm_params'])
	else:
		bn_config = None
	env_settings = get_env_settings(**variant['env_params'])
	env = env_settings['env']
	es = OUStrategy(env_spec=env.spec)
	qf = FeedForwardCritic(
		name_or_scope="critic",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant.get('qf_params', {})
	)
	policy = FeedForwardPolicy(
		name_or_scope="actor",
		env_spec=env.spec,
		batch_norm_config=bn_config,
		**variant.get('policy_params', {})
	)

	algorithm = MyDDPG(
		env,
		es,
		policy,
		qf,
		variant['tensorboard'],
		batch_norm_config=bn_config,
		**variant['algo_params'],
	)
	algorithm.train()
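Compared to lstm_launcher, this launcher also reads variant['tensorboard'] (passed positionally to MyDDPG) and an optional variant['batch_norm_params']. A hedged sketch of those extra entries follows; the BatchNormConfig keyword arguments are not shown in the source, so None is used here, and the env_params key is again an assumption:

# Illustrative variant for my_ddpg_launcher (values are placeholders).
variant = dict(
    algo_params=dict(n_epochs=50, batch_size=128),
    env_params=dict(env_id='cart'),      # assumption: depends on get_env_settings
    tensorboard='/tmp/tfboard/my_ddpg',  # tensorboard path passed to MyDDPG
    batch_norm_params=None,              # or a dict of BatchNormConfig kwargs
)
my_ddpg_launcher(variant)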
Code example #5
def run_task(_):
    from railrl.algos.ddpg import DDPG
    from railrl.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.quadratic_naf_qfunction import QuadraticNAF
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from sandbox.rocky.tf.envs.base import TfEnv
    from rllab.envs.gym_env import GymEnv

    def gym_env(name):
        return GymEnv(name,
                      record_video=False,
                      log_dir='/tmp/gym-test',  # Ignore gym log.
                      record_log=False)

    env = TfEnv(gym_env('AxeTwoDPoint-v0'))
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **ddpg_params
    )
    algorithm.train()
Code example #6
def example(*_):
    env = HalfCheetahEnv()
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        n_epochs=25,
        batch_size=1024,
        replay_pool_size=10000,
    )
    algorithm.train()
Code example #7
def run_task(_):
    for seed in range(3):
        env = TfEnv(HalfCheetahEnv())
        es = OUStrategy(env_spec=env.spec)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        ddpg_params = dict(
            batch_size=16,
            n_epochs=100,
            epoch_length=100,
            eval_samples=100,
            max_path_length=10,
            min_pool_size=2,
        )
        algorithm = DDPG(env, es, policy, qf, **ddpg_params)

        algorithm.train()
Code example #8
    def test_target_params_update(self):
        tau = 0.2
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            soft_target_tau=tau,
        )
        target_qf = algo.target_qf
        target_policy = algo.target_policy
        qf = algo.qf
        policy = algo.policy

        algo.train()

        orig_tc_vals = target_qf.get_param_values()
        orig_ta_vals = target_policy.get_param_values()
        orig_c_vals = qf.get_param_values()
        orig_a_vals = policy.get_param_values()
        algo.sess.run(algo.update_target_policy_op)
        algo.sess.run(algo.update_target_qf_op)
        new_tc_vals = target_qf.get_param_values()
        new_ta_vals = target_policy.get_param_values()

        for orig_tc_val, orig_c_val, new_tc_val in zip(orig_tc_vals,
                                                       orig_c_vals,
                                                       new_tc_vals):
            self.assertTrue((new_tc_val == tau * orig_c_val +
                             (1 - tau) * orig_tc_val).all())

        for orig_ta_val, orig_a_val, new_ta_val in zip(orig_ta_vals,
                                                       orig_a_vals,
                                                       new_ta_vals):
            self.assertTrue((new_ta_val == tau * orig_a_val +
                             (1 - tau) * orig_ta_val).all())
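The two loops above check the standard soft (Polyak) target update, target <- tau * source + (1 - tau) * target, applied parameter by parameter. Here is a standalone NumPy sketch of that rule, with illustrative names rather than the DDPG implementation itself:

import numpy as np

def soft_update(target_params, source_params, tau):
    # Polyak averaging: move each target parameter a fraction tau toward the source.
    return [
        tau * src + (1.0 - tau) * tgt
        for tgt, src in zip(target_params, source_params)
    ]

# With tau = 0.2 the target moves 20% of the way toward the source:
print(soft_update([np.zeros(3)], [np.ones(3)], tau=0.2))  # [array([0.2, 0.2, 0.2])]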
Code example #9
def main():
    env = TfEnv(CartpoleEnv())
    es = OUStrategy(env_spec=env.spec)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    default_ddpg_params = dict(
        batch_size=32,
        n_epochs=10,
        epoch_length=1000,
        eval_samples=1000,
        max_path_length=100,
        min_pool_size=1000,
    )
    sweeper = DeterministicHyperparameterSweeper(
        {'scale_reward': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]},
    )
    exp_prefix = 'ddpg-cart-reward-scale-sweep-{0}'.format(timestamp())
    for ddpg_params in sweeper.iterate_hyperparameters():
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            scale_reward=ddpg_params['scale_reward'],
            **default_ddpg_params,
        )

        for seed in range(3):
            run_experiment_lite(
                algorithm.train(),
                n_parallel=1,
                snapshot_mode="last",
                exp_prefix=exp_prefix,
                seed=seed,
                # mode="local",
                # use_cloudpickle=True,
            )
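DeterministicHyperparameterSweeper is only used here through iterate_hyperparameters(). Below is a minimal sketch of that behavior, assuming it yields one dict per point in the Cartesian product of the supplied value lists (an assumption, since the class itself is not shown):

import itertools

def iterate_hyperparameters(sweep_spec):
    # sweep_spec maps each hyperparameter name to a list of candidate values.
    # Yield one dict per combination, in a deterministic order.
    names = sorted(sweep_spec)
    for values in itertools.product(*(sweep_spec[name] for name in names)):
        yield dict(zip(names, values))

# With a single swept key this reduces to one dict per scale_reward value:
for params in iterate_hyperparameters({'scale_reward': [1e-3, 1e-2, 1e-1]}):
    print(params)  # {'scale_reward': 0.001}, then 0.01, then 0.1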
Code example #10
def main():
    stub(globals())
    env = TfEnv(CartpoleEnv())
    ddpg_params = dict(
        batch_size=128,
        n_epochs=50,
        epoch_length=1000,
        eval_samples=1000,
        discount=0.99,
        policy_learning_rate=1e-4,
        qf_learning_rate=1e-3,
        soft_target_tau=0.01,
        replay_pool_size=1000000,
        min_pool_size=256,
        scale_reward=1.0,
        max_path_length=1000,
        qf_weight_decay=0.01,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = QuadraticNAF(
        name_or_scope="quadratic_qf",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="actor",
        env_spec=env.spec,
    )
    algorithm = DDPG(env, es, policy, qf, **ddpg_params)

    for seed in range(3):
        env.reset()
        run_experiment_lite(
            algorithm.train(),
            n_parallel=1,
            snapshot_mode="last",
            exp_prefix="test-qddpg-cartpole",
            seed=seed,
        )
Code example #11
	algo = DDPG(
		env,
		es,
		policy,
		qf,
		"/data0/dianchen/box3d/ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
			policy_lr,
			qf_lr,
			gamma,
			tau,
		),
		qf_learning_rate=qf_lr,
		policy_learning_rate=policy_lr,
		discount=gamma,
		soft_target_tau=tau,
		gpu_ratio=0.25,
	)

	run_experiment_lite(
		algo.train(),
		exp_prefix="ddpg_box3d_state_v4_tf_policy_{0}_qf_{1}_gamma_{2}_tau_{3}".format(
			policy_lr,
			qf_lr,
			gamma,
			tau,
		),
		n_parallel=1,
		snapshot_mode="last",
		seed=seed,
		mode="local"
	)
def main():

    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)

    parser.add_argument('--qf_decay', type=float, default=0.01)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)

    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)

    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=0.95)

    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(
        'Box3dReachPixel-v15',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))

    name = 'ddpg-pixel-v15-plr{0}-qlr{1}-tau{2}-qfdecay{3}'.format(
        args.policy_initlr, args.qf_initlr, args.qf_soft_tau, args.qf_decay)

    es = OUStrategy(env_spec=env.spec,
                    theta=args.ou_theta,
                    sigma=args.ou_sigma)

    qf = ConvNNCritic(
        name_or_scope="critic",
        input_shape=env.observation_space.shape,
        env_spec=env.spec,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        observation_hidden_sizes=(256, ),
        embedded_hidden_sizes=(256, ),
        hidden_nonlinearity=tf.nn.relu,
    )

    policy = ConvNNPolicy(
        name_or_scope="actor",
        input_shape=env.observation_space.shape,
        env_spec=env.spec,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        observation_hidden_sizes=(256, 128),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
    )

    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        tensorboard_path=os.path.join(args.tfboard_path,
                                      name + '_%d' % args.seed),
        replay_pool_size=100000,
        obs_dtype='uint8',
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )

    run_experiment_lite(algo.train(),
                        exp_prefix=name,
                        n_parallel=1,
                        snapshot_mode="last",
                        seed=args.seed,
                        mode="local")
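Both conv networks above stack five stride-2 convolutions with 'SAME' padding, so each layer roughly halves the spatial resolution (ceiling division). A quick sketch of the resulting feature-map size; the 84-pixel input is illustrative only, since the real shape comes from env.observation_space:

import math

def conv_output_size(size, strides=(2, 2, 2, 2, 2)):
    # With 'SAME' padding, each stride-s layer maps size -> ceil(size / s).
    for s in strides:
        size = math.ceil(size / s)
    return size

print(conv_output_size(84))  # 84 -> 42 -> 21 -> 11 -> 6 -> 3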
def main():

    parser = argparse.ArgumentParser()
    # Hyperparameters
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--policy_initlr', type=float, default=1e-4)
    parser.add_argument('--qf_initlr', type=float, default=1e-3)

    parser.add_argument('--qf_decay', type=float, default=.0)
    parser.add_argument('--qf_soft_tau', type=float, default=1e-3)

    # Exploration hyperparameters
    parser.add_argument('--ou_theta', type=float, default=0.15)
    parser.add_argument('--ou_sigma', type=float, default=0.3)

    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--gpu_ratio', type=float, default=1.0)

    args = parser.parse_args()

    env = TfEnv(normalize(env=GymEnv(
        'Box3dReach-v11',
        record_video=False,
        log_dir='/tmp/gym_test',
        record_log=False,
    )))

    name = 'ddpg-state-v11-plr{0}-qlr{1}-tau{2}-qfdecay{3}-ou_theta{4}-ou_sigma{5}'.format(
        args.policy_initlr, args.qf_initlr, args.qf_soft_tau, args.qf_decay,
        args.ou_theta, args.ou_sigma)

    es = OUStrategy(env_spec=env.spec,
                    theta=args.ou_theta,
                    sigma=args.ou_sigma)

    policy = FeedForwardPolicy(
        name_or_scope="actor",
        observation_hidden_sizes=(400, 300),
        env_spec=env.spec,
    )

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
        embedded_hidden_sizes=(100, ),
        observation_hidden_sizes=(100, ),
    )

    algo = DDPG(
        env=env,
        exploration_strategy=es,
        policy=policy,
        qf=qf,
        tensorboard_path=os.path.join(args.tfboard_path,
                                      name + '_%d' % args.seed),
        qf_learning_rate=args.qf_initlr,
        policy_learning_rate=args.policy_initlr,
        soft_target_tau=args.qf_soft_tau,
        gpu_ratio=args.gpu_ratio,
    )

    run_experiment_lite(algo.train(),
                        exp_prefix=name,
                        n_parallel=1,
                        snapshot_mode="last",
                        seed=args.seed,
                        mode="local")