Example #1: DQN defaults for classic-control environments
def classic_control(env, default_seed=False, **kwargs):
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    in_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    params = dict(
        number_timesteps=int(1e4),
        test_episodes=10,
        save_path=None,
        save_interval=1e3,
        batch_size=32,
        double_q=True,
        buffer_size=1000,
        exploration_rate=0.2,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=200,
        target_network_update_freq=50,
        gamma=0.99,
        prioritized_replay=False,
        prioritized_alpha=0.6,
        prioritized_beta0=0.4,
        dueling=True,
    )
    params.update(kwargs)
    if params.get('network') is None:
        params['network'] = MLPQNet(in_dim, act_dim, params.pop('dueling'))
    if params.get('optimizer') is None:
        params['optimizer'] = tf.optimizers.Adam(5e-3, epsilon=1e-5)
    return dict(), params
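A minimal usage sketch for the defaults above; 'CartPole-v1' and the DQN class are illustrative assumptions, not part of this listing, and the hypothetical lines are left commented out:

import gym

env = gym.make('CartPole-v1')   # illustrative classic-control task
_, params = classic_control(env, batch_size=64)   # keyword overrides replace the defaults above
# agent = DQN(**params)   # hypothetical: params bundles the network, optimizer and training settings
# agent.learn(env)        # hypothetical training call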
Example #2: policy-gradient (PG) defaults for Box2D environments
def box2d(env, default_seed=True):
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    alg_params = dict()

    if alg_params.get('net_list') is None:
        num_hidden_layer = 1  # number of hidden layers for the networks
        hidden_dim = 32  # dimension of hidden layers for the networks
        with tf.name_scope('PG'):
            with tf.name_scope('Policy'):
                policy_net = StochasticPolicyNetwork(
                    env.observation_space, env.action_space,
                    num_hidden_layer * [hidden_dim])
        net_list = [policy_net]
        alg_params['net_list'] = net_list

    if alg_params.get('optimizers_list') is None:
        learning_rate = 0.02
        policy_optimizer = tf.optimizers.Adam(learning_rate)
        optimizers_list = [policy_optimizer]
        alg_params['optimizers_list'] = optimizers_list

    learn_params = dict(train_episodes=200,
                        test_episodes=100,
                        max_steps=200,
                        save_interval=20,
                        gamma=0.95)

    return alg_params, learn_params
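A hedged sketch of how the two returned dicts would typically be consumed; the PG class and 'LunarLander-v2' are illustrative assumptions:

import gym

env = gym.make('LunarLander-v2')          # an illustrative Box2D task
alg_params, learn_params = box2d(env)     # network, optimizer and training defaults from above
# agent = PG(**alg_params)                # hypothetical: build the policy-gradient agent
# agent.learn(env=env, **learn_params)    # hypothetical: train with the defaults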
Example #3: DQN defaults for Atari (raw-pixel) environments
def atari(env, default_seed=False, **kwargs):
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    in_dim = env.observation_space.shape
    act_dim = env.action_space.n
    params = dict(
        number_timesteps=int(1e7),  # for raw-pixel
        test_episodes=10,
        save_path=None,
        save_interval=1e4,
        batch_size=32,
        double_q=True,
        buffer_size=10000,
        exploration_rate=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=True,
        prioritized_alpha=0.6,
        prioritized_beta0=0.4,
        dueling=True)
    params.update(kwargs)

    if params.get('network') is None:
        params['network'] = CNNQNet(in_dim, act_dim, params.pop('dueling'))
    if params.get('optimizer') is None:
        params['optimizer'] = tf.optimizers.Adam(1e-4,
                                                 epsilon=1e-5,
                                                 clipnorm=10)
    return dict(), params
Example #4: Actor-Critic (AC) defaults for classic-control environments
def classic_control(env, default_seed=True):
    if default_seed:
        seed = 1
        set_seed(seed, env)  # reproducible

    alg_params = dict(
        gamma=0.9,
    )
    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('AC'):
            with tf.name_scope('Critic'):
                critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Actor'):
                actor = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                                hidden_dim_list=num_hidden_layer * [hidden_dim],
                                                output_activation=tf.nn.tanh)
        net_list = [actor, critic]
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        a_lr, c_lr = 1e-4, 2e-4  # a_lr: learning rate of the actor; c_lr: learning rate of the critic
        a_optimizer = tf.optimizers.Adam(a_lr)
        c_optimizer = tf.optimizers.Adam(c_lr)
        optimizers_list = [a_optimizer, c_optimizer]
        alg_params['optimizers_list'] = optimizers_list

    learn_params = dict(
        max_steps=200,
        train_episodes=500,
        test_episodes=100,
        save_interval=50,
    )

    return alg_params, learn_params
Example #5: DPPO defaults for classic-control environments (parallel envs)
def classic_control(env, default_seed=True):
    if default_seed:
        assert isinstance(env, list)
        seed = np.arange(len(env)).tolist()  # a list of seeds for each env
        set_seed(seed, env)  # reproducible

    # for multi-threading
    if isinstance(env, list):  # multiple envs are passed in for parallel computing
        num_env = len(env)  # number of envs passed in
        env = env[0]  # take one of the envs since they are all identical
    else:
        num_env = 1

    alg_params = dict(
        method='clip',  # method can be clip or penalty
        epsilon=0.2,  # for method 'clip'
        kl_target=0.01,  # for method 'penalty'
        lam=0.5  # for method 'penalty'
    )

    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('DPPO'):
            with tf.name_scope('V_Net'):
                v_net = ValueNetwork(env.observation_space,
                                     [hidden_dim] * num_hidden_layer)
            with tf.name_scope('Policy'):
                policy_net = StochasticPolicyNetwork(
                    env.observation_space, env.action_space,
                    [hidden_dim] * num_hidden_layer)

        net_list = v_net, policy_net
        alg_params['net_list'] = net_list

    if alg_params.get('optimizers_list') is None:
        actor_lr = 1e-4
        critic_lr = 2e-4
        optimizers_list = [
            tf.optimizers.Adam(critic_lr),
            tf.optimizers.Adam(actor_lr)
        ]
        alg_params['optimizers_list'] = optimizers_list

    learn_params = dict(train_episodes=1000,
                        test_episodes=100,
                        max_steps=200,
                        save_interval=50,
                        gamma=0.9,
                        a_update_steps=10,
                        c_update_steps=10,
                        n_workers=num_env,
                        batch_size=32)

    return alg_params, learn_params
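Because the seeding and n_workers logic above expects a list of identical environments, a hedged usage sketch might look like this; the DPPO class and the env id are illustrative assumptions:

import gym

n_workers = 4
envs = [gym.make('Pendulum-v0').unwrapped for _ in range(n_workers)]  # identical envs, one per worker
alg_params, learn_params = classic_control(envs)   # seeds every env and sets n_workers=4
# agent = DPPO(**alg_params)                        # hypothetical construction
# agent.learn(env=envs, **learn_params)             # hypothetical parallel training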
Example #6: DDPG defaults for Box2D environments
def box2d(env, default_seed=True):
    if default_seed:
        # reproducible
        seed = 2
        set_seed(seed, env)

    alg_params = dict(
        replay_buffer_size=10000,
        tau=0.01,
    )

    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('DDPG'):
            with tf.name_scope('Q_Net'):
                q_net = QNetwork(env.observation_space, env.action_space,
                                 num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Q_Net'):
                target_q_net = QNetwork(env.observation_space,
                                        env.action_space,
                                        num_hidden_layer * [hidden_dim])
            with tf.name_scope('Policy'):
                policy_net = DeterministicPolicyNetwork(
                    env.observation_space, env.action_space,
                    num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Policy'):
                target_policy_net = DeterministicPolicyNetwork(
                    env.observation_space, env.action_space,
                    num_hidden_layer * [hidden_dim])

        net_list = [q_net, target_q_net, policy_net, target_policy_net]
        alg_params['net_list'] = net_list

    if alg_params.get('optimizers_list') is None:
        actor_lr = 1e-3
        critic_lr = 2e-3
        optimizers_list = [
            tf.optimizers.Adam(critic_lr),
            tf.optimizers.Adam(actor_lr)
        ]
        alg_params['optimizers_list'] = optimizers_list

    learn_params = dict(train_episodes=100,
                        test_episodes=10,
                        max_steps=200,
                        save_interval=10,
                        explore_steps=500,
                        batch_size=32,
                        gamma=0.9,
                        noise_scale=1.,
                        noise_scale_decay=0.995)

    return alg_params, learn_params
Example #7: SAC defaults for Box2D environments
def box2d(env, default_seed=True):
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    alg_params = dict(
        replay_buffer_capacity=5e5,
    )
    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks, default as the same for each layer here
        with tf.name_scope('SAC'):
            with tf.name_scope('Q_Net1'):
                soft_q_net1 = QNetwork(env.observation_space, env.action_space,
                                       hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Q_Net2'):
                soft_q_net2 = QNetwork(env.observation_space, env.action_space,
                                       hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Q_Net1'):
                target_soft_q_net1 = QNetwork(env.observation_space, env.action_space,
                                              hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Q_Net2'):
                target_soft_q_net2 = QNetwork(env.observation_space, env.action_space,
                                              hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Policy'):
                policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                                     hidden_dim_list=num_hidden_layer * [hidden_dim],
                                                     output_activation=None,
                                                     state_conditioned=True)
        net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net]
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4  # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha
        soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr)
        soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr)
        policy_optimizer = tf.optimizers.Adam(policy_lr)
        alpha_optimizer = tf.optimizers.Adam(alpha_lr)
        optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer]
        alg_params['optimizers_list'] = optimizers_list

    learn_params = dict(
        max_steps=150,
        batch_size=64,
        explore_steps=200,
        update_itr=3,
        policy_target_update_interval=3,
        reward_scale=1.,
        AUTO_ENTROPY=True,
        train_episodes=100,
        test_episodes=10,
        save_interval=10,
    )

    return alg_params, learn_params
Example #8: multi-worker Actor-Critic (A3C-style) defaults for classic-control environments
def classic_control(env, default_seed=True):
    if default_seed:
        assert isinstance(env, list)
        seed = np.arange(len(env)).tolist()  # a list of seeds for each env
        set_seed(seed, env)  # reproducible

    # for multi-threading
    if isinstance(env, list):  # multiple envs are passed in for parallel computing
        num_env = len(env)  # number of envs passed in
        env = env[0]  # take one of the envs since they are all identical
    else:
        num_env = 1

    alg_params = dict(entropy_beta=0.005)
    if alg_params.get('net_list') is None:
        num_hidden_layer = 4  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        net_list2 = []  # list of per-worker network lists, one entry per thread/process plus one global
        for _ in range(num_env + 1):  # additional one for global
            with tf.name_scope('AC'):
                with tf.name_scope('Critic'):
                    critic = ValueNetwork(env.observation_space,
                                          hidden_dim_list=num_hidden_layer *
                                          [hidden_dim])
                with tf.name_scope('Actor'):
                    actor = StochasticPolicyNetwork(
                        env.observation_space,
                        env.action_space,
                        hidden_dim_list=num_hidden_layer * [hidden_dim])
            net_list = [actor, critic]
            net_list2.append(net_list)
        alg_params['net_list'] = net_list2
    if alg_params.get('optimizers_list') is None:
        a_lr, c_lr = 1e-3, 1e-3  # a_lr: learning rate of the actor; c_lr: learning rate of the critic
        a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor')
        c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic')
        optimizers_list = [a_optimizer, c_optimizer]
        alg_params['optimizers_list'] = optimizers_list

    learn_params = dict(max_steps=100,
                        gamma=0.9,
                        train_episodes=1000,
                        test_episodes=10,
                        save_interval=100,
                        update_itr=10,
                        n_workers=num_env)

    return alg_params, learn_params
Example #9: PPO defaults for classic-control environments
def classic_control(env, default_seed=True):
    if default_seed:
        # reproducible
        seed = 1
        set_seed(seed, env)

    alg_params = dict(
        method='clip',  # method can be 'clip' or 'penalty'
        epsilon=0.2,  # for method 'clip'
        kl_target=0.01,  # for method 'penalty'
        lam=0.5,  # for method 'penalty'
    )

    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('PPO'):
            with tf.name_scope('V_Net'):
                v_net = ValueNetwork(env.observation_space,
                                     [hidden_dim] * num_hidden_layer)
            with tf.name_scope('Policy'):
                policy_net = StochasticPolicyNetwork(
                    env.observation_space,
                    env.action_space, [hidden_dim] * num_hidden_layer,
                    output_activation=tf.nn.tanh,
                    trainable=True)
        net_list = [v_net, policy_net]
        alg_params['net_list'] = net_list

    if alg_params.get('optimizers_list') is None:
        actor_lr = 1e-4
        critic_lr = 2e-4
        optimizers_list = [
            tf.optimizers.Adam(critic_lr),
            tf.optimizers.Adam(actor_lr)
        ]
        alg_params['optimizers_list'] = optimizers_list

    learn_params = dict(train_episodes=1000,
                        test_episodes=100,
                        max_steps=200,
                        save_interval=50,
                        gamma=0.9,
                        batch_size=32,
                        a_update_steps=10,
                        c_update_steps=10)

    return alg_params, learn_params
Example #10: DQN defaults for Atari environments (QNetwork-based variant)
def atari(env, default_seed=False, **kwargs):
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    assert isinstance(env.action_space, Discrete)

    alg_params = dict(
        dueling=True,
        double_q=True,
        buffer_size=1000,
        prioritized_replay=True,
        prioritized_alpha=0.6,
        prioritized_beta0=0.4,
    )
    alg_params.update(kwargs)
    if alg_params.get('net_list') is None:
        alg_params['net_list'] = [
            QNetwork(env.observation_space,
                     env.action_space, [64],
                     state_only=True,
                     dueling=alg_params['dueling'])
        ]

    if alg_params.get('optimizers_list') is None:
        alg_params['optimizers_list'] = [
            tf.optimizers.Adam(1e-4, epsilon=1e-5, clipnorm=10)
        ]

    learn_params = dict(
        train_episodes=int(1e5),
        test_episodes=10,
        max_steps=200,
        save_interval=1e4,
        batch_size=32,
        exploration_rate=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
    )

    return alg_params, learn_params
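These defaults also accept keyword overrides via **kwargs; a hedged sketch follows, in which the env id, the missing Atari frame preprocessing, and the DQN class are assumptions:

import gym

env = gym.make('PongNoFrameskip-v4')   # illustrative Atari task; frame preprocessing is not shown here
alg_params, learn_params = atari(env, buffer_size=5000, dueling=False)   # override two defaults
# agent = DQN(**alg_params)              # hypothetical construction
# agent.learn(env=env, **learn_params)   # hypothetical training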
Example #11: TRPO defaults for classic-control environments
def classic_control(env, default_seed=True):
    if default_seed:
        # reproducible
        seed = 2
        set_seed(seed, env)

    alg_params = dict(
        damping_coeff=0.1,
        cg_iters=10,
        delta=0.01
    )

    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('TRPO'):
            with tf.name_scope('V_Net'):
                v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer)
            with tf.name_scope('Policy'):
                policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                                     [hidden_dim] * num_hidden_layer, output_activation=tf.nn.tanh)

        net_list = [v_net, policy_net]
        alg_params['net_list'] = net_list

    if alg_params.get('optimizers_list') is None:
        critic_lr = 1e-3
        optimizers_list = [tf.optimizers.Adam(critic_lr)]
        alg_params['optimizers_list'] = optimizers_list

    learn_params = dict(train_episodes=2000,
                        test_episodes=100,
                        max_steps=200,
                        save_interval=100,
                        gamma=0.9,
                        batch_size=256,
                        backtrack_iters=10,
                        backtrack_coeff=0.8,
                        train_critic_iters=80)

    return alg_params, learn_params
Example #12: script that builds networks and optimizers for PPO_CLIP on Pendulum-v0
from rlzoo.common.utils import make_env, set_seed
from rlzoo.algorithms.ppo_clip.ppo_clip import PPO_CLIP
from rlzoo.common.policy_networks import *
from rlzoo.common.value_networks import *
import gym
""" load environment """
env = gym.make('Pendulum-v0').unwrapped

# reproducible
seed = 1
set_seed(seed, env)
""" build networks for the algorithm """
name = 'PPO_CLIP'
hidden_dim = 64
num_hidden_layer = 2
critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer,
                      name=name + '_value')

actor = StochasticPolicyNetwork(env.observation_space,
                                env.action_space,
                                [hidden_dim] * num_hidden_layer,
                                output_activation=tf.nn.tanh,
                                name=name + '_policy')
net_list = critic, actor
""" create model """
actor_lr = 1e-4
critic_lr = 2e-4
optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]

model = PPO_CLIP(
    net_list,
    optimizers_list,  # hedged completion: pairing the optimizers with the networks follows the pattern above
)
Example #13: script that builds networks and optimizers for DPPO_CLIP with parallel Pendulum-v0 envs
import gym

# from common.env_wrappers import DummyVecEnv
from rlzoo.common.utils import make_env, set_seed
from rlzoo.algorithms.dppo_clip.dppo_clip import DPPO_CLIP
from rlzoo.common.value_networks import *
from rlzoo.common.policy_networks import *

n_workers = 4
''' load environment '''
env = [gym.make('Pendulum-v0').unwrapped for i in range(n_workers)]

# reproducible
seed = 2
set_seed(seed)

''' build networks for the algorithm '''
name = 'DPPO_CLIP'
hidden_dim = 64
num_hidden_layer = 2
critic = ValueNetwork(env[0].observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value')

actor = StochasticPolicyNetwork(env[0].observation_space, env[0].action_space,
                                [hidden_dim] * num_hidden_layer,
                                trainable=True,
                                name=name + '_policy')
net_list = critic, actor

''' create model '''
actor_lr = 1e-4
critic_lr = 2e-4
optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]  # hedged completion, mirroring the PPO_CLIP example above
Example #14: TD3 defaults for RLBench environments
def rlbench(env, default_seed=True):
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    alg_params = dict(
        replay_buffer_capacity=5e5,
        policy_target_update_interval=5,
    )
    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('TD3'):
            with tf.name_scope('Q_Net1'):
                q_net1 = QNetwork(env.observation_space,
                                  env.action_space,
                                  hidden_dim_list=num_hidden_layer *
                                  [hidden_dim])
            with tf.name_scope('Q_Net2'):
                q_net2 = QNetwork(env.observation_space,
                                  env.action_space,
                                  hidden_dim_list=num_hidden_layer *
                                  [hidden_dim])
            with tf.name_scope('Target_Q_Net1'):
                target_q_net1 = QNetwork(env.observation_space,
                                         env.action_space,
                                         hidden_dim_list=num_hidden_layer *
                                         [hidden_dim])
            with tf.name_scope('Target_Q_Net2'):
                target_q_net2 = QNetwork(env.observation_space,
                                         env.action_space,
                                         hidden_dim_list=num_hidden_layer *
                                         [hidden_dim])
            with tf.name_scope('Policy'):
                policy_net = DeterministicPolicyNetwork(
                    env.observation_space,
                    env.action_space,
                    hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Policy'):
                target_policy_net = DeterministicPolicyNetwork(
                    env.observation_space,
                    env.action_space,
                    hidden_dim_list=num_hidden_layer * [hidden_dim])
        net_list = [
            q_net1, q_net2, target_q_net1, target_q_net2, policy_net,
            target_policy_net
        ]
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        q_lr, policy_lr = 3e-4, 3e-4  # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network
        q_optimizer1 = tf.optimizers.Adam(q_lr)
        q_optimizer2 = tf.optimizers.Adam(q_lr)
        policy_optimizer = tf.optimizers.Adam(policy_lr)
        optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer]
        alg_params['optimizers_list'] = optimizers_list

    learn_params = dict(
        max_steps=150,
        batch_size=64,
        explore_steps=500,
        update_itr=3,
        reward_scale=1.,
        explore_noise_scale=1.0,
        eval_noise_scale=0.5,
        train_episodes=100,
        test_episodes=10,
        save_interval=10,
    )

    return alg_params, learn_params
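The function only reads the env's observation and action spaces, so a hedged sketch with any continuous-control gym task also works; the TD3 class and 'Pendulum-v0' are illustrative assumptions:

import gym

env = gym.make('Pendulum-v0').unwrapped   # any continuous-action env exposes the required spaces
alg_params, learn_params = rlbench(env)   # six networks, three optimizers and training defaults
# agent = TD3(**alg_params)               # hypothetical construction
# agent.learn(env=env, **learn_params)    # hypothetical training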