def classic_control(env, default_seed=False, **kwargs):
    """Default hyperparameters for DQN on classic-control environments."""
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    in_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    params = dict(
        number_timesteps=int(1e4),
        test_episodes=10,
        save_path=None,
        save_interval=1e3,
        batch_size=32,
        double_q=True,
        buffer_size=1000,
        exploration_rate=0.2,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=200,
        target_network_update_freq=50,
        gamma=0.99,
        prioritized_replay=False,
        prioritized_alpha=0.6,
        prioritized_beta0=0.4,
        dueling=True,
    )
    params.update(kwargs)

    if params.get('network') is None:
        params['network'] = MLPQNet(in_dim, act_dim, params.pop('dueling'))
    if params.get('optimizer') is None:
        params['optimizer'] = tf.optimizers.Adam(5e-3, epsilon=1e-5)
    return dict(), params
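# A minimal usage sketch (not part of the defaults module): in RLzoo these
# default-parameter functions are normally reached through build_env() and
# call_default_params(); those entry-point names and the DQN export are
# assumptions based on RLzoo's common wrappers, not verified here.
from rlzoo.common.env_wrappers import build_env
from rlzoo.common.utils import call_default_params
from rlzoo.algorithms import DQN

env = build_env('CartPole-v0', 'classic_control')
alg_params, learn_params = call_default_params(env, 'classic_control', 'DQN')
alg = DQN(**alg_params)
alg.learn(env=env, mode='train', **learn_params)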
def box2d(env, default_seed=True):
    """Default hyperparameters for PG (vanilla policy gradient) on Box2D environments."""
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    alg_params = dict()
    if alg_params.get('net_list') is None:
        num_hidden_layer = 1  # number of hidden layers for the networks
        hidden_dim = 32  # dimension of hidden layers for the networks
        with tf.name_scope('PG'):
            with tf.name_scope('Policy'):
                policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                                     num_hidden_layer * [hidden_dim])
        net_list = [policy_net]
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        learning_rate = 0.02
        policy_optimizer = tf.optimizers.Adam(learning_rate)
        optimizers_list = [policy_optimizer]
        alg_params['optimizers_list'] = optimizers_list
    learn_params = dict(train_episodes=200,
                        test_episodes=100,
                        max_steps=200,
                        save_interval=20,
                        gamma=0.95)
    return alg_params, learn_params
def atari(env, default_seed=False, **kwargs):
    """Default hyperparameters for DQN on Atari (raw-pixel) environments."""
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    in_dim = env.observation_space.shape
    act_dim = env.action_space.n
    params = dict(
        number_timesteps=int(1e7),  # for raw-pixel input
        test_episodes=10,
        save_path=None,
        save_interval=1e4,
        batch_size=32,
        double_q=True,
        buffer_size=10000,
        exploration_rate=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=True,
        prioritized_alpha=0.6,
        prioritized_beta0=0.4,
        dueling=True,
    )
    params.update(kwargs)

    if params.get('network') is None:
        params['network'] = CNNQNet(in_dim, act_dim, params.pop('dueling'))
    if params.get('optimizer') is None:
        params['optimizer'] = tf.optimizers.Adam(1e-4, epsilon=1e-5, clipnorm=10)
    return dict(), params
def classic_control(env, default_seed=True):
    """Default hyperparameters for AC (actor-critic) on classic-control environments."""
    if default_seed:
        seed = 1
        set_seed(seed, env)  # reproducible

    alg_params = dict(
        gamma=0.9,
    )
    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('AC'):
            with tf.name_scope('Critic'):
                critic = ValueNetwork(env.observation_space,
                                      hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Actor'):
                actor = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                                hidden_dim_list=num_hidden_layer * [hidden_dim],
                                                output_activation=tf.nn.tanh)
        net_list = [actor, critic]
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        a_lr, c_lr = 1e-4, 2e-4  # a_lr: learning rate of the actor; c_lr: learning rate of the critic
        a_optimizer = tf.optimizers.Adam(a_lr)
        c_optimizer = tf.optimizers.Adam(c_lr)
        optimizers_list = [a_optimizer, c_optimizer]
        alg_params['optimizers_list'] = optimizers_list
    learn_params = dict(
        max_steps=200,
        train_episodes=500,
        test_episodes=100,
        save_interval=50,
    )
    return alg_params, learn_params
def classic_control(env, default_seed=True):
    """Default hyperparameters for DPPO on classic-control environments.

    `env` may be a list of identical environments for parallel training;
    see the usage sketch below.
    """
    if default_seed:
        assert isinstance(env, list)
        seed = np.arange(len(env)).tolist()  # a list of seeds, one for each env
        set_seed(seed, env)  # reproducible

    # for multi-threading
    if isinstance(env, list):  # check whether multiple envs are passed in for parallel computing
        num_env = len(env)  # number of envs passed in
        env = env[0]  # take one of the envs, as they are all the same
    else:
        num_env = 1

    alg_params = dict(
        method='clip',  # method can be 'clip' or 'penalty'
        epsilon=0.2,  # for method 'clip'
        kl_target=0.01,  # for method 'penalty'
        lam=0.5,  # for method 'penalty'
    )
    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('DPPO'):
            with tf.name_scope('V_Net'):
                v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer)
            with tf.name_scope('Policy'):
                policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                                     [hidden_dim] * num_hidden_layer)
        net_list = v_net, policy_net
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        actor_lr = 1e-4
        critic_lr = 2e-4
        optimizers_list = [
            tf.optimizers.Adam(critic_lr),
            tf.optimizers.Adam(actor_lr)
        ]
        alg_params['optimizers_list'] = optimizers_list
    learn_params = dict(train_episodes=1000,
                        test_episodes=100,
                        max_steps=200,
                        save_interval=50,
                        gamma=0.9,
                        a_update_steps=10,
                        c_update_steps=10,
                        n_workers=num_env,
                        batch_size=32)
    return alg_params, learn_params
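# A minimal sketch of the parallel case handled above: a list of identical envs
# is passed in so that each worker gets its own copy and its own seed. The
# CartPole task and the 4-worker count are illustrative assumptions.
import gym

n_workers = 4
envs = [gym.make('CartPole-v0').unwrapped for _ in range(n_workers)]
alg_params, learn_params = classic_control(envs)  # seeds the envs 0..3, sets n_workers=4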
def box2d(env, default_seed=True):
    """Default hyperparameters for DDPG on Box2D environments."""
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    alg_params = dict(
        replay_buffer_size=10000,
        tau=0.01,
    )
    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('DDPG'):
            with tf.name_scope('Q_Net'):
                q_net = QNetwork(env.observation_space, env.action_space,
                                 num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Q_Net'):
                target_q_net = QNetwork(env.observation_space, env.action_space,
                                        num_hidden_layer * [hidden_dim])
            with tf.name_scope('Policy'):
                policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space,
                                                        num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Policy'):
                target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space,
                                                               num_hidden_layer * [hidden_dim])
        net_list = [q_net, target_q_net, policy_net, target_policy_net]
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        actor_lr = 1e-3
        critic_lr = 2e-3
        optimizers_list = [
            tf.optimizers.Adam(critic_lr),
            tf.optimizers.Adam(actor_lr)
        ]
        alg_params['optimizers_list'] = optimizers_list
    learn_params = dict(train_episodes=100,
                        test_episodes=10,
                        max_steps=200,
                        save_interval=10,
                        explore_steps=500,
                        batch_size=32,
                        gamma=0.9,
                        noise_scale=1.,
                        noise_scale_decay=0.995)
    return alg_params, learn_params
def box2d(env, default_seed=True):
    """Default hyperparameters for SAC on Box2D environments."""
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    alg_params = dict(
        replay_buffer_capacity=5e5,
    )
    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks, the same for each layer here
        with tf.name_scope('SAC'):
            with tf.name_scope('Q_Net1'):
                soft_q_net1 = QNetwork(env.observation_space, env.action_space,
                                       hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Q_Net2'):
                soft_q_net2 = QNetwork(env.observation_space, env.action_space,
                                       hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Q_Net1'):
                target_soft_q_net1 = QNetwork(env.observation_space, env.action_space,
                                              hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Q_Net2'):
                target_soft_q_net2 = QNetwork(env.observation_space, env.action_space,
                                              hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Policy'):
                policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                                     hidden_dim_list=num_hidden_layer * [hidden_dim],
                                                     output_activation=None,
                                                     state_conditioned=True)
        net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net]
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        # soft_q_lr: learning rate of the Q networks; policy_lr: learning rate of the
        # policy network; alpha_lr: learning rate of the entropy temperature alpha
        soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4
        soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr)
        soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr)
        policy_optimizer = tf.optimizers.Adam(policy_lr)
        alpha_optimizer = tf.optimizers.Adam(alpha_lr)
        optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer]
        alg_params['optimizers_list'] = optimizers_list
    learn_params = dict(
        max_steps=150,
        batch_size=64,
        explore_steps=200,
        update_itr=3,
        policy_target_update_interval=3,
        reward_scale=1.,
        AUTO_ENTROPY=True,
        train_episodes=100,
        test_episodes=10,
        save_interval=10,
    )
    return alg_params, learn_params
def classic_control(env, default_seed=True):
    """Default hyperparameters for A3C on classic-control environments.

    `env` may be a list of identical environments, one per worker.
    """
    if default_seed:
        assert isinstance(env, list)
        seed = np.arange(len(env)).tolist()  # a list of seeds, one for each env
        set_seed(seed, env)  # reproducible

    # for multi-threading
    if isinstance(env, list):  # check whether multiple envs are passed in for parallel computing
        num_env = len(env)  # number of envs passed in
        env = env[0]  # take one of the envs, as they are all the same
    else:
        num_env = 1

    alg_params = dict(entropy_beta=0.005)
    if alg_params.get('net_list') is None:
        num_hidden_layer = 4  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        net_list2 = []  # list of network lists, one item per thread/process
        for _ in range(num_env + 1):  # an additional one for the global networks
            with tf.name_scope('AC'):
                with tf.name_scope('Critic'):
                    critic = ValueNetwork(env.observation_space,
                                          hidden_dim_list=num_hidden_layer * [hidden_dim])
                with tf.name_scope('Actor'):
                    actor = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                                    hidden_dim_list=num_hidden_layer * [hidden_dim])
            net_list = [actor, critic]
            net_list2.append(net_list)
        alg_params['net_list'] = net_list2
    if alg_params.get('optimizers_list') is None:
        a_lr, c_lr = 1e-3, 1e-3  # a_lr: learning rate of the actor; c_lr: learning rate of the critic
        a_optimizer = tf.optimizers.RMSprop(a_lr, name='RMS_optimizer_actor')
        c_optimizer = tf.optimizers.RMSprop(c_lr, name='RMS_optimizer_critic')
        optimizers_list = [a_optimizer, c_optimizer]
        alg_params['optimizers_list'] = optimizers_list
    learn_params = dict(max_steps=100,
                        gamma=0.9,
                        train_episodes=1000,
                        test_episodes=10,
                        save_interval=100,
                        update_itr=10,
                        n_workers=num_env)
    return alg_params, learn_params
def classic_control(env, default_seed=True):
    """Default hyperparameters for PPO on classic-control environments."""
    if default_seed:
        seed = 1
        set_seed(seed, env)  # reproducible

    alg_params = dict(
        method='clip',  # method can be 'clip' or 'penalty'
        epsilon=0.2,  # for method 'clip'
        kl_target=0.01,  # for method 'penalty'
        lam=0.5,  # for method 'penalty'
    )
    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('PPO'):
            with tf.name_scope('V_Net'):
                v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer)
            with tf.name_scope('Policy'):
                policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                                     [hidden_dim] * num_hidden_layer,
                                                     output_activation=tf.nn.tanh,
                                                     trainable=True)
        net_list = [v_net, policy_net]
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        actor_lr = 1e-4
        critic_lr = 2e-4
        optimizers_list = [
            tf.optimizers.Adam(critic_lr),
            tf.optimizers.Adam(actor_lr)
        ]
        alg_params['optimizers_list'] = optimizers_list
    learn_params = dict(train_episodes=1000,
                        test_episodes=100,
                        max_steps=200,
                        save_interval=50,
                        gamma=0.9,
                        batch_size=32,
                        a_update_steps=10,
                        c_update_steps=10)
    return alg_params, learn_params
def atari(env, default_seed=False, **kwargs):
    """Default hyperparameters for DQN on Atari environments."""
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    assert isinstance(env.action_space, Discrete)
    alg_params = dict(
        dueling=True,
        double_q=True,
        buffer_size=1000,
        prioritized_replay=True,
        prioritized_alpha=0.6,
        prioritized_beta0=0.4,
    )
    alg_params.update(kwargs)
    if alg_params.get('net_list') is None:
        alg_params['net_list'] = [
            QNetwork(env.observation_space, env.action_space, [64],
                     state_only=True, dueling=alg_params['dueling'])
        ]
    if alg_params.get('optimizers_list') is None:
        alg_params['optimizers_list'] = [tf.optimizers.Adam(1e-4, epsilon=1e-5, clipnorm=10)]
    learn_params = dict(
        train_episodes=int(1e5),
        test_episodes=10,
        max_steps=200,
        save_interval=1e4,
        batch_size=32,
        exploration_rate=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
    )
    return alg_params, learn_params
def classic_control(env, default_seed=True):
    """Default hyperparameters for TRPO on classic-control environments."""
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    alg_params = dict(
        damping_coeff=0.1,
        cg_iters=10,
        delta=0.01,
    )
    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('TRPO'):
            with tf.name_scope('V_Net'):
                v_net = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer)
            with tf.name_scope('Policy'):
                policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                                     [hidden_dim] * num_hidden_layer,
                                                     output_activation=tf.nn.tanh)
        net_list = [v_net, policy_net]
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        critic_lr = 1e-3
        optimizers_list = [tf.optimizers.Adam(critic_lr)]
        alg_params['optimizers_list'] = optimizers_list
    learn_params = dict(train_episodes=2000,
                        test_episodes=100,
                        max_steps=200,
                        save_interval=100,
                        gamma=0.9,
                        batch_size=256,
                        backtrack_iters=10,
                        backtrack_coeff=0.8,
                        train_critic_iters=80)
    return alg_params, learn_params
import gym

from rlzoo.common.utils import make_env, set_seed
from rlzoo.algorithms.ppo_clip.ppo_clip import PPO_CLIP
from rlzoo.common.policy_networks import *
from rlzoo.common.value_networks import *

""" load environment """
env = gym.make('Pendulum-v0').unwrapped

# reproducible
seed = 1
set_seed(seed, env)

""" build networks for the algorithm """
name = 'PPO_CLIP'
hidden_dim = 64
num_hidden_layer = 2
critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer,
                      name=name + '_value')
actor = StochasticPolicyNetwork(env.observation_space, env.action_space,
                                [hidden_dim] * num_hidden_layer,
                                output_activation=tf.nn.tanh,
                                name=name + '_policy')
net_list = critic, actor

""" create model """
actor_lr = 1e-4
critic_lr = 2e-4
optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]
model = PPO_CLIP(net_list, optimizers_list)
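""" train and test """
# A sketch of the typical train/test calls. The mode and render keyword names
# follow RLzoo's learn() convention and are assumptions here; the hyperparameter
# values mirror the PPO classic-control defaults above.
model.learn(env, mode='train', render=False, train_episodes=1000, max_steps=200,
            save_interval=50, gamma=0.9, batch_size=32, a_update_steps=10,
            c_update_steps=10)
model.learn(env, mode='test', render=True, test_episodes=100, max_steps=200)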
import gym

# from common.env_wrappers import DummyVecEnv
from rlzoo.common.utils import make_env, set_seed
from rlzoo.algorithms.dppo_clip.dppo_clip import DPPO_CLIP
from rlzoo.common.value_networks import *
from rlzoo.common.policy_networks import *

n_workers = 4

''' load environment '''
env = [gym.make('Pendulum-v0').unwrapped for i in range(n_workers)]

# reproducible
seed = 2
set_seed(seed)

''' build networks for the algorithm '''
name = 'DPPO_CLIP'
hidden_dim = 64
num_hidden_layer = 2
critic = ValueNetwork(env[0].observation_space, [hidden_dim] * num_hidden_layer,
                      name=name + '_value')
actor = StochasticPolicyNetwork(env[0].observation_space, env[0].action_space,
                                [hidden_dim] * num_hidden_layer,
                                trainable=True,
                                name=name + '_policy')
net_list = critic, actor

''' create model '''
actor_lr = 1e-4
critic_lr = 2e-4
optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]
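# A sketch of model creation and the train/test calls. The constructor signature
# mirrors the PPO_CLIP script above, and the n_workers keyword is taken from the
# DPPO defaults; both are assumptions rather than the verified DPPO_CLIP API.
model = DPPO_CLIP(net_list, optimizers_list)
model.learn(env, mode='train', render=False, train_episodes=1000, max_steps=200,
            save_interval=50, gamma=0.9, batch_size=32, a_update_steps=10,
            c_update_steps=10, n_workers=n_workers)
model.learn(env, mode='test', render=True, test_episodes=100, max_steps=200)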
def rlbench(env, default_seed=True):
    """Default hyperparameters for TD3 on RLBench environments."""
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    alg_params = dict(
        replay_buffer_capacity=5e5,
        policy_target_update_interval=5,
    )
    if alg_params.get('net_list') is None:
        num_hidden_layer = 2  # number of hidden layers for the networks
        hidden_dim = 64  # dimension of hidden layers for the networks
        with tf.name_scope('TD3'):
            with tf.name_scope('Q_Net1'):
                q_net1 = QNetwork(env.observation_space, env.action_space,
                                  hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Q_Net2'):
                q_net2 = QNetwork(env.observation_space, env.action_space,
                                  hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Q_Net1'):
                target_q_net1 = QNetwork(env.observation_space, env.action_space,
                                         hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Q_Net2'):
                target_q_net2 = QNetwork(env.observation_space, env.action_space,
                                         hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Policy'):
                policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space,
                                                        hidden_dim_list=num_hidden_layer * [hidden_dim])
            with tf.name_scope('Target_Policy'):
                target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space,
                                                               hidden_dim_list=num_hidden_layer * [hidden_dim])
        net_list = [
            q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net
        ]
        alg_params['net_list'] = net_list
    if alg_params.get('optimizers_list') is None:
        # q_lr: learning rate of the Q networks; policy_lr: learning rate of the policy network
        q_lr, policy_lr = 3e-4, 3e-4
        q_optimizer1 = tf.optimizers.Adam(q_lr)
        q_optimizer2 = tf.optimizers.Adam(q_lr)
        policy_optimizer = tf.optimizers.Adam(policy_lr)
        optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer]
        alg_params['optimizers_list'] = optimizers_list
    learn_params = dict(
        max_steps=150,
        batch_size=64,
        explore_steps=500,
        update_itr=3,
        reward_scale=1.,
        explore_noise_scale=1.0,
        eval_noise_scale=0.5,
        train_episodes=100,
        test_episodes=10,
        save_interval=10,
    )
    return alg_params, learn_params
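# A hedged usage sketch for the RLBench defaults above. The 'ReachTarget' task
# name and the build_env()/call_default_params() entry points are assumptions
# based on RLzoo's common wrappers, and RLBench itself must be installed.
from rlzoo.common.env_wrappers import build_env
from rlzoo.common.utils import call_default_params
from rlzoo.algorithms import TD3

env = build_env('ReachTarget', 'rlbench')
alg_params, learn_params = call_default_params(env, 'rlbench', 'TD3')
alg = TD3(**alg_params)
alg.learn(env=env, mode='train', **learn_params)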