def run_task(*_):
    env = normalize(Walker2DEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The policy network has two hidden layers whose widths are taken
        # from the hyperparameter grids indexed by h.
        hidden_sizes=(H_layer_first[h], H_layer_second[h])
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=size_of_batch,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=number_of_episodes,
        discount=discount_factor,
        scale_reward=reward_scaling[r],
        qf_learning_rate=critic_learning_rate[c],
        policy_learning_rate=actor_learning_rate[c],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()

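# The run_task above references module-level hyperparameter grids
# (H_layer_first, H_layer_second, reward_scaling, critic_learning_rate,
# actor_learning_rate) and loop indices h, r, c that are not shown here.
# The definitions below are a hypothetical sketch of how such grids might be
# set up for a sweep; the values are illustrative, not the original settings.
H_layer_first = [32, 64, 128]
H_layer_second = [32, 64, 128]
reward_scaling = [0.01, 0.1, 1.0]
critic_learning_rate = [1e-3, 1e-4]
actor_learning_rate = [1e-4, 1e-5]
size_of_batch = 32
number_of_episodes = 1000
discount_factor = 0.99

# A sweep could then loop over the indices and call run_task for each setting:
# for h in range(len(H_layer_first)):
#     for r in range(len(reward_scaling)):
#         for c in range(len(critic_learning_rate)):
#             run_task()
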
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)

    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0 / 140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )

    data_path = 'data/%s_data_rllab_%s/%s/' % (
        env_name.replace('-', '_'), str(algo.__class__.__name__), exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)

def rllab_ddpg_launcher(variant):
    from rllab.algos.ddpg import DDPG as RllabDDPG
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from rllab.q_functions.continuous_mlp_q_function import (
        ContinuousMLPQFunction as TheanoContinuousMLPQFunction
    )
    from rllab.policies.deterministic_mlp_policy import (
        DeterministicMLPPolicy as TheanoDeterministicMLPPolicy
    )
    from railrl.launchers.launcher_util import get_env_settings

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    policy = TheanoDeterministicMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )
    es = OUStrategy(env_spec=env.spec)
    qf = TheanoContinuousMLPQFunction(env_spec=env.spec)
    algorithm = RllabDDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        **variant['algo_params']
    )
    algorithm.train()

def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()

def run_task(*_):
    env = normalize(
        GymEnv(env_name="MountainCarContinuous-v0", force_reset=True))
    max_path_length = 300

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=(64, 64))

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=100,
        n_updates_per_sample=1,
        max_path_length=max_path_length,
        epoch_length=900,
        min_pool_size=800,
        replay_pool_size=5000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    )
    algo.train()

def run_task(*_):
    env = normalize(SwimmerEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=200,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables visualization of policy rollouts during training.
        plot=True,
    )
    algo.train()

class DDPGModel(Model):
    def __init__(self):
        # Note: rllab's DDPG needs env, policy, es and qf arguments.
        self.ddpg = DDPG()

    def predict(self, obs):
        # get_action returns (action, agent_info); only the action is needed.
        action, _ = self.ddpg.policy.get_action(obs)
        return action

    def train(self, batch_data):
        # rllab's DDPG runs its own sampling/training loop and
        # does not accept an external batch.
        self.ddpg.train()

def run_task(*_):
    """DDPG on the Hopper environment."""
    env = normalize(HopperEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers with 400 and 300 hidden units.
        hidden_sizes=(400, 300))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    """ Using the DDPG algorithm """
    # Alternative, shorter-horizon configuration:
    # algo = DDPG(
    #     env=env,
    #     policy=policy,
    #     es=es,
    #     qf=qf,
    #     batch_size=32,
    #     max_path_length=500,
    #     epoch_length=500,
    #     min_pool_size=10000,
    #     n_epochs=20000,
    #     discount=0.99,
    #     scale_reward=0.01,
    #     qf_learning_rate=1e-3,
    #     policy_learning_rate=1e-4,
    #     plot=True,
    # )

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=10000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=10e-3,
        policy_learning_rate=10e-4,
        # plot=True enables visualization of policy rollouts during training.
        plot=True,
    )
    algo.train()

def run_task(*_):
    """DDPG on the Swimmer environment."""
    env = normalize(SwimmerEnv())

    """ Initialise the policy as a neural network policy """
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    """ Defining the exploration strategy: OUStrategy """
    """
    This strategy implements the Ornstein-Uhlenbeck process, which adds
    time-correlated noise to the actions taken by the deterministic policy.
    The OU process satisfies the stochastic differential equation

        dx_t = theta * (mu - x_t) * dt + sigma * dW_t,

    where W_t denotes the Wiener process.
    """
    es = OUStrategy(env_spec=env.spec)

    """ Defining the Q network """
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    """ Using the DDPG algorithm """
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=500,
        epoch_length=500,
        min_pool_size=10000,
        n_epochs=20000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables visualization of policy rollouts during training.
        plot=True,
    )

    """ Training the networks based on the DDPG algorithm """
    algo.train()

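# The snippet above describes the Ornstein-Uhlenbeck exploration noise only in
# prose. As a minimal sketch (not rllab's actual OUStrategy implementation), the
# SDE dx_t = theta * (mu - x_t) * dt + sigma * dW_t can be simulated with an
# Euler-Maruyama step; the function name and default values below are illustrative.
import numpy as np


def ou_noise_step(x, mu=0.0, theta=0.15, sigma=0.3, dt=1.0, rng=np.random):
    """One Euler-Maruyama step of the OU process starting from state x."""
    # dW_t is approximated by sqrt(dt) times standard normal noise.
    return x + theta * (mu - x) * dt + sigma * np.sqrt(dt) * rng.standard_normal(np.shape(x))


# Example: accumulate time-correlated noise for a 2-dimensional action over 5 steps.
state = np.zeros(2)
for _ in range(5):
    state = ou_noise_step(state)
    # The exploratory action would be the deterministic policy action plus `state`.
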
def test_ddpg():
    env = CartpoleEnv()
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        qf=qf,
        es=es,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    algo.train()

def test_rllab(patient_id=1, Initial_Bg=0):
    try:
        from rllab.algos.ddpg import DDPG
        from rllab.envs.normalized_env import normalize
        from rllab.exploration_strategies.ou_strategy import OUStrategy
        from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
        from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
        from rllab.envs.gym_env import GymEnv
    except ImportError:
        print('rllab is not installed!')
        return None

    env = GymEnv('simglucose-adult{}-CHO{}-v0'.format(Initial_Bg, patient_id + 1))
    env = normalize(env)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each
        # with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(env=env,
                policy=policy,
                es=es,
                qf=qf,
                batch_size=32,
                max_path_length=100,
                epoch_length=1000,
                min_pool_size=10000,
                n_epochs=5,
                discount=0.99,
                scale_reward=0.01,
                qf_learning_rate=1e-3,
                policy_learning_rate=1e-4)
    algo.train()
    # env.close()
    return es, policy

def run_task(*_):
    # env = normalize(HalfCheetahEnv())
    env = normalize(GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # env = normalize(GymEnv(env_name="BipedalWalker-v2", force_reset=True, record_video=True))
    max_path_length = 400
    # print("env.horizon: ", env.horizon)
    # input()
    # env._max_episode_steps = max_path_length

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=(64, 64))

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=max_path_length,
        train_epoch_interval=300,
        min_pool_size=500,
        replay_pool_size=10000,
        n_updates_per_sample=1,
        n_steps=75000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()

def run_task(*_):
    env = normalize(GymEnv(args.env, force_reset=True, record_video=False))
    env.wrapped_env.env.env.reward_flag = args.reward

    if args.hidden_sizes == 0:
        hidden_sizes = (8,)
    elif args.hidden_sizes == 1:
        hidden_sizes = (32, 32)
    elif args.hidden_sizes == 2:
        hidden_sizes = (100, 50, 25)
    elif args.hidden_sizes == 3:
        hidden_sizes = (400, 300)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The hidden layer sizes are selected via args.hidden_sizes above.
        hidden_sizes=hidden_sizes
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=95,
        epoch_length=args.batch_size,
        min_pool_size=10000,
        n_epochs=args.n_itr,
        discount=args.gamma,
        scale_reward=args.scale_reward,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        eval_samples=95,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()

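# The run_task above reads its settings from a module-level `args` object that
# is not shown. A hypothetical argparse setup covering the referenced fields
# might look like the following; the default values are illustrative only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='MountainCarContinuous-v0')
parser.add_argument('--reward', type=int, default=0)         # value assigned to reward_flag
parser.add_argument('--hidden_sizes', type=int, default=1)   # index selecting the layer sizes
parser.add_argument('--batch_size', type=int, default=1000)  # used as epoch_length above
parser.add_argument('--n_itr', type=int, default=1000)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--scale_reward', type=float, default=0.01)
args = parser.parse_args()
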
    es = OUStrategy(env_spec=env.spec, theta=0.5)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has three hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32, 32))

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=100,
        min_pool_size=1000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
else:
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 128 hidden units.
        hidden_sizes=(128, 128))

def __init__(self):
    self.ddpg = DDPG()

print("#Experiments number:", num)
variant = variants[num]

# es = OUStrategy(env_spec=env.spec, theta=0.15, sigma=0.3)
es = GaussianStrategy(env_spec=env.spec, max_sigma=1.0, min_sigma=0.1,
                      decay_period=variant["decay_period"])

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    batch_size=35,
    max_path_length=100,
    epoch_length=5000,
    min_pool_size=10000,
    n_epochs=100,
    discount=0.99,
    scale_reward=variant["scale_reward"],
    soft_target_tau=1e-3,
    qf_learning_rate=variant["qf_learning_rate"],
    policy_learning_rate=variant["policy_learning_rate"],
    # plot=True enables visualization of policy rollouts during training.
    plot=True,
    eval_samples=5000,
)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration

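# The snippet above indexes into a `variants` list of hyperparameter dicts that
# is not shown. Below is a plain-Python sketch of how such a list might be built
# as a grid over the keys it uses (decay_period, scale_reward, qf_learning_rate,
# policy_learning_rate); the candidate values are illustrative assumptions.
import itertools

grid = {
    "decay_period": [10000, 100000],
    "scale_reward": [0.01, 0.1],
    "qf_learning_rate": [1e-3, 1e-4],
    "policy_learning_rate": [1e-4, 1e-5],
}
variants = [dict(zip(grid, values)) for values in itertools.product(*grid.values())]
# `num` would then select one configuration, e.g. from a command-line argument.
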
    hidden_sizes=(32, 32)
)

es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(env_spec=env.spec)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    batch_size=32,
    max_path_length=100,
    epoch_length=1000,
    min_pool_size=10000,
    n_epochs=1000,
    discount=0.99,
    scale_reward=0.01,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",

    output_W_init=LI.Uniform(-3e-6, 3e-6),
    output_b_init=LI.Uniform(-3e-6, 3e-6),
)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    batch_size=256,             # Number of samples for each minibatch.
    max_path_length=1500,       # 5 seconds
    epoch_length=15000,         # How many timesteps for each epoch.
    min_pool_size=15000,        # Minimum size of the pool to start training.
    replay_pool_size=15000000,
    n_epochs=1000,              # Number of epochs. Policy will be evaluated after each epoch.
    eval_samples=15000,         # Number of samples (timesteps) for evaluating the policy.
    discount=1.0,
    scale_reward=0.1,           # The scaling factor applied to the rewards when training.
    qf_learning_rate=1e-3,      # Learning rate for training the Q function.
    policy_learning_rate=1e-4,  # Learning rate for training the policy.
    # qf_weight_decay=0.01,
    soft_target_tau=0.005,      # Interpolation parameter for the soft target update.
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)

log_dir = os.path.join(os.getcwd(), 'data')
logger.set_snapshot_dir(log_dir)
logger.add_text_output(os.path.join(log_dir, 'debug.log'))

env = GymEnv('simglucose-adolescent2-v0')
env = normalize(env)

policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(env_spec=env.spec)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    batch_size=32,
    max_path_length=100,
    epoch_length=3,
    min_pool_size=10000,
    n_epochs=1000,
    discount=0.99,
    scale_reward=0.01,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4
)
algo.train()

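# The custom env id above ('simglucose-adolescent2-v0') has to be registered
# with gym before GymEnv can load it. The sketch below follows the registration
# pattern from simglucose's documentation; the entry point and patient name are
# assumptions and should be checked against the installed package.
from gym.envs.registration import register

register(
    id='simglucose-adolescent2-v0',
    entry_point='simglucose.envs:T1DSimEnv',
    kwargs={'patient_name': 'adolescent#002'}
)
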
es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_nonlinearity=activation_map[args.vf_activation],
    hidden_sizes=args.vf_size,
)

algo = DDPG(env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=128,
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,
            discount=0.995,
            scale_reward=args.reward_scale,
            qf_learning_rate=1e-3,
            policy_learning_rate=1e-4,
            plot=False)

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.log_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed

# =======================
# Defining the algorithm
# =======================
es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(env_spec=env.spec)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    max_path_length=96,
    epoch_length=1000,
    min_pool_size=10000,
    batch_size=batch_size,
    discount=gamma,
    n_epochs=n_itr,
    scale_reward=0.01,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
)

# Formatting string for data directory
hidden_arc = [str(i) for i in hidden_sizes]
hidden_arc = '_'.join(hidden_arc)

data_dir = 'DDPG_{}_nIters_{}_stepSize_{}_gamma_{}_initStd_{}{}_policyPar_{}_reward_{}'\
    .format(batch_size, n_itr, step_size, ''.join(str(gamma).split('.')),
            init_std, learn_std, hidden_arc, reward_fun)

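# The data_dir string built above is only a name; it still has to be wired into
# rllab's logging, as other snippets here do with logger.set_snapshot_dir and
# logger.add_text_output. A minimal sketch, assuming the rllab logger module is
# used and CSV progress output is also wanted; the 'data' prefix is an assumption.
import os
from rllab.misc import logger

log_dir = os.path.join('data', data_dir)
os.makedirs(log_dir, exist_ok=True)
logger.set_snapshot_dir(log_dir)
logger.add_text_output(os.path.join(log_dir, 'debug.log'))
logger.add_tabular_output(os.path.join(log_dir, 'progress.csv'))
# algo.train() would then write snapshots and progress into log_dir.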