def run_task(*_): """ DPG on Swimmer environment """ env = normalize(SwimmerEnv()) """ Initialise the policy as a neural network policy """ policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32)) """ Defining exploration strategy : OUStrategy - """ """ This strategy implements the Ornstein-Uhlenbeck process, which adds time-correlated noise to the actions taken by the deterministic policy. The OU process satisfies the following stochastic differential equation: dxt = theta*(mu - xt)*dt + sigma*dWt where Wt denotes the Wiener process """ es = OUStrategy(env_spec=env.spec) """ Defining the Q network """ qf = ContinuousMLPQFunction(env_spec=env.spec) w = qf.get_param_values(regularizable=True) """ Persistence Length Exploration """ lp = Persistence_Length_Exploration(env=env, qf=qf, policy=policy) """ Using the DDPG algorithm """ algo = DDPG( env=env, policy=policy, es=es, qf=qf, lp=lp, batch_size=32, max_path_length=1000, epoch_length=1000, min_pool_size=10000, n_epochs=15000, discount=0.99, scale_reward=0.01, qf_learning_rate=1e-3, policy_learning_rate=1e-4, #Uncomment both lines (this and the plot parameter below) to enable plotting plot=True, ) """ Training the networks based on the DDPG algorithm """ algo.train()
def run_task(*_):
    f = open('/home/qingkai/ddpg_performance.csv', "w+")
    env = PointGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2, activity_range=6)

    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    qf_cost = ContinuousMLPQFunction(env_spec=env.spec)
    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    algo = PDO_DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        qf_cost=qf_cost,
        dual_var=0,
        safety_constraint=safety_constraint,
        batch_size=64,
        max_path_length=15,
        epoch_length=10000,
        min_pool_size=10000,
        n_epochs=150,
        discount=0.99,
        qf_learning_rate=1e-3,
        qf_cost_learning_rate=1e-3,
        dual_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        scale_reward=1,
        scale_cost=5,
        soft_target=True,
        soft_target_tau=0.001,
        eval_samples=10000,
        qf_weight_decay=0.,
        qf_cost_weight_decay=0.,
        avg_horizon=100000,
        # plot=True,
    )
    algo.train()
    f.close()
def run_task(*_):
    env = normalize(
        GymEnv(env_name="MountainCarContinuous-v0", force_reset=True))
    max_path_length = 300

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=100,
        n_updates_per_sample=1,
        max_path_length=max_path_length,
        epoch_length=900,
        min_pool_size=800,
        replay_pool_size=5000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    )
    algo.train()
def run_task(*_):
    env = normalize(SwimmerEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=200,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Plotting is enabled; comment out the line below to disable it.
        plot=True,
    )
    algo.train()
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)

    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0 / 140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )

    data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace('-', '_'),
                                               str(algo.__class__.__name__),
                                               exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
def run_task(*_):
    env = normalize(Walker2DEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # Hidden layer sizes are swept over the H_layer_first / H_layer_second grids.
        hidden_sizes=(H_layer_first[h], H_layer_second[h])
    )
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=size_of_batch,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=number_of_episodes,
        discount=discount_factor,
        scale_reward=reward_scaling[r],
        qf_learning_rate=critic_learning_rate[c],
        policy_learning_rate=actor_learning_rate[c],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    env = normalize(SimpleHumanoidEnv())
    # env = SimpleHumanoidEnv()

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(32, 32))

    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(
        env=env,
        qf=qf,
        policy=policy,
        L_p=L_p_param[l_p_ind],
        b_step_size=b_step_size[b_ind],
        sigma=sigma_param[s_ind],
        max_exploratory_steps=max_exploratory_steps_iters,
        batch_size=batch_size_value,
        n_epochs=num_episodes,
        scale_reward=0.01,
        epoch_length=steps_per_episode,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
    )

    """
    DDPG
    """
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=batch_size_value,
        max_path_length=100,
        epoch_length=steps_per_episode,
        min_pool_size=10000,
        n_epochs=num_episodes,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_): """ DPG on Hopper environment """ env = normalize(HopperEnv()) policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(400, 300)) es = OUStrategy(env_spec=env.spec) qf = ContinuousMLPQFunction(env_spec=env.spec) """ Using the DDPG algorithm """ # algo = DDPG( # env=env, # policy=policy, # es=es, # qf=qf, # batch_size=32, # max_path_length=500, # epoch_length=500, # min_pool_size=10000, # n_epochs=20000, # discount=0.99, # scale_reward=0.01, # qf_learning_rate=1e-3, # policy_learning_rate=1e-4, # #Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, # ) algo = DDPG( env=env, policy=policy, es=es, qf=qf, batch_size=64, max_path_length=1000, epoch_length=1000, min_pool_size=10000, n_epochs=10000, discount=0.99, scale_reward=0.01, qf_learning_rate=10e-3, policy_learning_rate=10e-4, #Uncomment both lines (this and the plot parameter below) to enable plotting plot=True, ) algo.train()
def test_ddpg():
    env = CartpoleEnv()
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        qf=qf,
        es=es,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    algo.train()
def test_rllab(patient_id=1, Initial_Bg=0):
    try:
        from rllab.algos.ddpg import DDPG
        from rllab.envs.normalized_env import normalize
        from rllab.exploration_strategies.ou_strategy import OUStrategy
        from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
        from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
        from rllab.envs.gym_env import GymEnv
    except ImportError:
        print('rllab is not installed!')
        return None

    env = GymEnv('simglucose-adult{}-CHO{}-v0'.format(Initial_Bg, patient_id + 1))
    env = normalize(env)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each
        # with 32 hidden units.
        hidden_sizes=(32, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(env=env,
                policy=policy,
                es=es,
                qf=qf,
                batch_size=32,
                max_path_length=100,
                epoch_length=1000,
                min_pool_size=10000,
                n_epochs=5,
                discount=0.99,
                scale_reward=0.01,
                qf_learning_rate=1e-3,
                policy_learning_rate=1e-4)
    algo.train()
    # env.close()
    return es, policy
def run_task(*_):
    # env = normalize(HalfCheetahEnv())
    env = normalize(GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # env = normalize(GymEnv(env_name="BipedalWalker-v2", force_reset=True, record_video=True))
    max_path_length = 400
    # print("env.horizon: ", env.horizon)
    # input()
    # env._max_episode_steps = max_path_length

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=(64, 64)
                                )

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=max_path_length,
        train_epoch_interval=300,
        min_pool_size=500,
        replay_pool_size=10000,
        n_updates_per_sample=1,
        n_steps=75000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    env = normalize(GymEnv(args.env, force_reset=True, record_video=False))
    env.wrapped_env.env.env.reward_flag = args.reward

    if args.hidden_sizes == 0:
        hidden_sizes = (8,)
    elif args.hidden_sizes == 1:
        hidden_sizes = (32, 32)
    elif args.hidden_sizes == 2:
        hidden_sizes = (100, 50, 25)
    elif args.hidden_sizes == 3:
        hidden_sizes = (400, 300)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # Hidden layer sizes are chosen by the args.hidden_sizes index above.
        hidden_sizes=hidden_sizes
    )
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=95,
        epoch_length=args.batch_size,
        min_pool_size=10000,
        n_epochs=args.n_itr,
        discount=args.gamma,
        scale_reward=args.scale_reward,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        eval_samples=95,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.misc import instrument
import sys

instrument.stub(globals())

env = normalize(PegEnv(), normalize_reward=True)

policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 42 hidden units.
    hidden_sizes=(42, 42)
)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_sizes=(42, 42)
)

vg = instrument.VariantGenerator()
vg.add("scale_reward", [0.01])  # , 0.001, 0.1])
vg.add("policy_learning_rate", [1e-4])  # , 1e-3, 1e-5])
vg.add("qf_learning_rate", [1e-3])  # , 1e-3, 1e-4])
vg.add("decay_period", [1E+6, 1E+5, 1E+4, 1E+3, 1E+7, 1E+8, 1E+9, 1E+10])
variants = vg.variants()

num = int(sys.argv[1])
print("#Experiment number:", num)
variant = variants[num]

# es = OUStrategy(env_spec=env.spec, theta=0.15, sigma=0.3)
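# A standalone sketch (independent of rllab) of what the VariantGenerator pattern above
# amounts to: variants() enumerates the cross product of every value list passed to
# vg.add(...), and the command-line index selects one combination so that each array job
# trains a single hyperparameter setting. The dict below reuses the values from the
# snippet above purely for illustration.
import itertools

grid = {
    "scale_reward": [0.01],
    "policy_learning_rate": [1e-4],
    "qf_learning_rate": [1e-3],
    "decay_period": [1E+6, 1E+5, 1E+4, 1E+3, 1E+7, 1E+8, 1E+9, 1E+10],
}
all_variants = [dict(zip(grid, combo)) for combo in itertools.product(*grid.values())]
# len(all_variants) == 1 * 1 * 1 * 8 == 8, and variants[num] picks one of the 8 settings.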
envs = { "Arm": ArmEnv, "Stand": StandEnv, "Gait": GaitEnv, "Crouch": CrouchEnv, "Hop": HopEnv } env = normalize(envs[parsed.env](visualize=False)) # env = normalize(CartpoleEnv()) # env = normalize(GymEnv("Pendulum-v0", record_video=False, record_log=False)) if alg == "DDPG": qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64, 64)) es = OUStrategy(env_spec=env.spec, theta=0.5) policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32, 32)) algo = DDPG( env=env, policy=policy, es=es, qf=qf, batch_size=32, max_path_length=100,
"tanh": NL.tanh, "leaky_relu": NL.LeakyRectify } policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=args.policy_size, hidden_nonlinearity=activation_map[args.policy_activation], ) es = OUStrategy(env_spec=env.spec) qf = ContinuousMLPQFunction( env_spec=env.spec, hidden_nonlinearity=activation_map[args.vf_activation], hidden_sizes=args.vf_size, ) algo = DDPG(env=env, policy=policy, es=es, qf=qf, batch_size=128, max_path_length=env.horizon, epoch_length=1000, min_pool_size=10000, n_epochs=args.num_epochs, discount=0.995, scale_reward=args.reward_scale, qf_learning_rate=1e-3,
env = normalize(FWMAVSimEnv())

policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    hidden_nonlinearity=NL.tanh,  # alternatives: NL.rectify, NL.LeakyRectify
    output_nonlinearity=NL.tanh,
    hidden_sizes=(32, 32),
)

es = OUStrategy(
    env_spec=env.spec,
    theta=0.15,
    sigma=0.3
)
# theta is the decay rate of the noise: a smaller theta decays more slowly and fluctuates
# more (theta = 0.01 decays over roughly 220 steps, 0.022 over ~100 steps, 0.1 over ~20
# steps, 0.15 over ~15 steps); sigma is the size (standard deviation) of the noise.

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_nonlinearity=NL.tanh,
    output_nonlinearity=None,
    hidden_sizes=(128, 128),
    output_W_init=LI.Uniform(-3e-6, 3e-6),
    output_b_init=LI.Uniform(-3e-6, 3e-6),
)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    batch_size=256,          # Number of samples for each minibatch.
    max_path_length=1500,    # 5 seconds
    epoch_length=15000,      # How many timesteps for each epoch.
    min_pool_size=15000,     # Minimum size of the pool to start training.
    replay_pool_size=15000000,
    n_epochs=
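# A quick sanity check on the step counts quoted in the theta comment above.
# Assumption: those counts correspond to the time for the mean-reverting term of the OU
# process to shrink an initial deviation to roughly 10%, i.e. solving exp(-theta * t) = 0.1
# for t, which gives t = ln(10) / theta.
import math

for theta in (0.01, 0.022, 0.1, 0.15):
    steps_to_10_percent = math.log(10) / theta
    print("theta=%.3f -> ~%d steps" % (theta, round(steps_to_10_percent)))
# Prints roughly: theta=0.010 -> ~230 steps, theta=0.022 -> ~105 steps,
#                 theta=0.100 -> ~23 steps,  theta=0.150 -> ~15 steps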
        max_path_length=env.horizon,
        n_itr=2000000,
        discount=0.99,
        step_size=0.01,
        # Plotting is enabled; comment out the line below to disable it.
        plot=True,
    )
else:
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
def run_task(*_):
    f = open('/home/qingkai/verina.csv', "w+")
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2
    env = PointGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2, activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(subsample_factor=trpo_subsample_factor)
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    ddpg_policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 32))
    ddpg_es = OUStrategy(env_spec=env.spec)
    ddpg_qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(100, 100))
    ddpg_qf_cost = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(100, 100))
    offline_itr_n = 100000

    algo = PDO_OFF(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=20000,
        max_path_length=15,
        n_itr=200,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        ddpg_policy=ddpg_policy,
        ddpg_qf=ddpg_qf,
        ddpg_qf_cost=ddpg_qf_cost,
        ddpg_es=ddpg_es,
        ddpg_dual_var=0,
        ddpg_batch_size=64,
        ddpg_qf_learning_rate=1e-4,
        ddpg_qf_cost_learning_rate=1e-4,
        ddpg_dual_learning_rate=1e-3,
        ddpg_policy_learning_rate=1e-3,
        ddpg_scale_reward=1,
        ddpg_scale_cost=1,
        offline_itr_n=offline_itr_n,
        balance=0,
        safety_tradeoff_coeff_lr=1e-2,
        ddpg_avg_horizon=offline_itr_n,
        adjust_epoch=5,
        ddpg_qf_weight_decay=0.,
        # plot=True,
    )
    algo.train()
    f.close()