def run_task(*_):
    """Train DDPG on the normalized Walker2D environment.

    Positional arguments are accepted and ignored. Layer widths, batch size,
    epoch count, discount, reward scale and learning rates are read from
    names defined outside this function (``H_layer_first``,
    ``H_layer_second``, ``size_of_batch``, ``number_of_episodes``,
    ``discount_factor``, ``reward_scaling``, ``critic_learning_rate``,
    ``actor_learning_rate``), indexed by the outer ``h``, ``r`` and ``c``.
    """
    walker = normalize(Walker2DEnv())
    spec = walker.spec

    # Two hidden layers; their widths are picked by the outer index ``h``.
    actor = DeterministicMLPPolicy(
        env_spec=spec,
        hidden_sizes=(H_layer_first[h], H_layer_second[h]),
    )
    noise = OUStrategy(env_spec=spec)
    critic = ContinuousMLPQFunction(env_spec=spec)

    # Both learning rates are selected with the same index ``c``.
    settings = dict(
        batch_size=size_of_batch,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=number_of_episodes,
        discount=discount_factor,
        scale_reward=reward_scaling[r],
        qf_learning_rate=critic_learning_rate[c],
        policy_learning_rate=actor_learning_rate[c],
        # Plotting is off; pass plot=True here (and enable the matching
        # plot parameter of the experiment launcher) to turn it on.
    )
    trainer = DDPG(env=walker, policy=actor, es=noise, qf=critic, **settings)
    trainer.train()
def rllab_ddpg_launcher(variant):
    """Build and train rllab's DDPG from an experiment ``variant`` dict.

    ``variant['env_params']`` is expanded into ``get_env_settings`` to obtain
    the environment, and ``variant['algo_params']`` is expanded into the DDPG
    constructor. The policy uses two hidden layers of 32 units.
    """
    # Imports stay local to the function, in their original order.
    from rllab.algos.ddpg import DDPG as RllabDDPG
    from rllab.exploration_strategies.ou_strategy import OUStrategy
    from rllab.q_functions.continuous_mlp_q_function import (
        ContinuousMLPQFunction as TheanoContinuousMLPQFunction
    )
    from rllab.policies.deterministic_mlp_policy import (
        DeterministicMLPPolicy as TheanoDeterministicMLPPolicy
    )
    from railrl.launchers.launcher_util import get_env_settings

    env = get_env_settings(**variant['env_params'])['env']
    spec = env.spec

    actor = TheanoDeterministicMLPPolicy(env_spec=spec, hidden_sizes=(32, 32))
    exploration = OUStrategy(env_spec=spec)
    critic = TheanoContinuousMLPQFunction(env_spec=spec)

    RllabDDPG(
        env=env,
        policy=actor,
        es=exploration,
        qf=critic,
        **variant['algo_params']
    ).train()
def run_task(*_):
    """Train DDPG on the normalized Gym ``MountainCarContinuous-v0`` task.

    Positional arguments are accepted and ignored. Both the policy and the
    Q-function use two hidden layers of 64 units.
    """
    max_path_length = 300
    layer_widths = (64, 64)

    car_env = normalize(
        GymEnv(env_name="MountainCarContinuous-v0", force_reset=True))
    spec = car_env.spec

    actor = DeterministicMLPPolicy(env_spec=spec, hidden_sizes=layer_widths)
    noise = OUStrategy(env_spec=spec)
    critic = ContinuousMLPQFunction(env_spec=spec, hidden_sizes=layer_widths)

    hyperparams = {
        "batch_size": 100,
        "n_updates_per_sample": 1,
        "max_path_length": max_path_length,
        "epoch_length": 900,
        "min_pool_size": 800,
        "replay_pool_size": 5000,
        "n_epochs": 1000,
        "discount": 0.99,
        "scale_reward": 0.1,
        "qf_learning_rate": 1e-3,
        "policy_learning_rate": 1e-4,
    }
    trainer = DDPG(env=car_env, policy=actor, es=noise, qf=critic,
                   **hyperparams)
    trainer.train()
def run_task(*_):
    """Train DDPG on the normalized Cartpole environment.

    Positional arguments are accepted and ignored.
    """
    cartpole = normalize(CartpoleEnv())
    spec = cartpole.spec

    # Policy network: two hidden layers of 32 units each.
    actor = DeterministicMLPPolicy(env_spec=spec, hidden_sizes=(32, 32))
    exploration = OUStrategy(env_spec=spec)
    critic = ContinuousMLPQFunction(env_spec=spec)

    trainer = DDPG(
        env=cartpole,
        policy=actor,
        es=exploration,
        qf=critic,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Plotting is off; pass plot=True here (and enable the matching
        # plot parameter of the experiment launcher) to turn it on.
    )
    trainer.train()
def main(exp_name, ent_wt=1.0):
    """Train DDPG on ``LunarLanderContinuous-v3`` and snapshot into ``data/``.

    Args:
        exp_name: Name of the final path component of the snapshot
            directory ``data/<env>_data_rllab_<algo class>/<exp_name>/``.
        ent_wt: Accepted for interface compatibility; not used by this
            function.

    The logger's snapshot directory is reset to ``None`` when training ends,
    including when ``algo.train()`` raises, so a failed run does not leave
    the shared logger pointing at this run's directory.
    """
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)

    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0 / 140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Plotting is off; pass plot=True here (and enable the matching
        # plot parameter of the experiment launcher) to turn it on.
    )

    # '%s' already applies str(), so no explicit conversion is needed.
    data_path = 'data/%s_data_rllab_%s/%s/' % (
        env_name.replace('-', '_'), algo.__class__.__name__, exp_name)
    os.makedirs(data_path, exist_ok=True)

    logger.set_snapshot_dir(data_path)
    try:
        algo.train()
    finally:
        # Always restore the logger, even if training fails.
        logger.set_snapshot_dir(None)
def run_task(*_):
    """Train DDPG on the normalized Swimmer environment with plotting on.

    Positional arguments are accepted and ignored.
    """
    swimmer = normalize(SwimmerEnv())
    spec = swimmer.spec

    # Policy network: two hidden layers of 32 units each.
    actor = DeterministicMLPPolicy(env_spec=spec, hidden_sizes=(32, 32))
    noise = OUStrategy(env_spec=spec)
    critic = ContinuousMLPQFunction(env_spec=spec)

    hyperparams = dict(
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=200,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Plotting is enabled here. Per the original note, the matching
        # plot parameter of the experiment launcher (not visible in this
        # function) has to be enabled as well.
        plot=True,
    )
    trainer = DDPG(env=swimmer, policy=actor, es=noise, qf=critic,
                   **hyperparams)
    trainer.train()
class DDPGModel(Model):
    """``Model`` subclass that delegates prediction and training to DDPG."""

    def __init__(self):
        # NOTE(review): DDPG is built with no arguments here; confirm that
        # the DDPG class in scope supports a no-argument constructor.
        self.ddpg = DDPG()

    def predict(self, obs):
        """Return whatever the DDPG policy's ``get_action`` yields for ``obs``.

        The previous version computed this value but never returned it, so
        callers always received ``None``.
        """
        return self.ddpg.policy.get_action(observation=obs)

    def train(self, batch_data):
        """Forward ``batch_data`` to the wrapped DDPG instance's ``train``."""
        # The keyword previously had no value (``batch_data=``), which is a
        # syntax error; pass the argument through.
        self.ddpg.train(batch_data=batch_data)
def run_task(*_):
    """Train DDPG on the normalized Hopper environment with plotting on.

    Positional arguments are accepted and ignored.
    """
    hopper = normalize(HopperEnv())
    spec = hopper.spec

    # Policy network: two hidden layers of 400 and 300 units.
    actor = DeterministicMLPPolicy(env_spec=spec, hidden_sizes=(400, 300))
    noise = OUStrategy(env_spec=spec)
    critic = ContinuousMLPQFunction(env_spec=spec)

    # An earlier configuration (batch 32, 500-step paths and epochs, 20000
    # epochs, learning rates 1e-3 / 1e-4) was replaced by the values below.
    hyperparams = dict(
        batch_size=64,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=10000,
        discount=0.99,
        scale_reward=0.01,
        # NOTE(review): 10e-3 == 0.01 and 10e-4 == 0.001, i.e. ten times the
        # 1e-3 / 1e-4 of the earlier configuration -- confirm this is intended.
        qf_learning_rate=10e-3,
        policy_learning_rate=10e-4,
        # Plotting is enabled here. Per the original note, the matching
        # plot parameter of the experiment launcher (not visible in this
        # function) has to be enabled as well.
        plot=True,
    )
    trainer = DDPG(env=hopper, policy=actor, es=noise, qf=critic,
                   **hyperparams)
    trainer.train()
def run_task(*_):
    """Train DDPG on the normalized Swimmer environment with plotting on.

    Positional arguments are accepted and ignored.
    """
    swimmer = normalize(SwimmerEnv())
    spec = swimmer.spec

    # Deterministic policy network: two hidden layers of 32 units each.
    actor = DeterministicMLPPolicy(env_spec=spec, hidden_sizes=(32, 32))

    # Exploration: Ornstein-Uhlenbeck process, which adds time-correlated
    # noise to the actions of the deterministic policy. It satisfies
    #     dx_t = theta * (mu - x_t) * dt + sigma * dW_t
    # where W_t denotes the Wiener process.
    exploration = OUStrategy(env_spec=spec)

    # Q network (critic).
    critic = ContinuousMLPQFunction(env_spec=spec)

    # DDPG algorithm configuration.
    trainer = DDPG(
        env=swimmer,
        policy=actor,
        es=exploration,
        qf=critic,
        batch_size=32,
        max_path_length=500,
        epoch_length=500,
        min_pool_size=10000,
        n_epochs=20000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Plotting is enabled here. Per the original note, the matching
        # plot parameter of the experiment launcher (not visible in this
        # function) has to be enabled as well.
        plot=True,
    )

    # Train the networks with DDPG.
    trainer.train()
def test_ddpg():
    """Smoke test: run DDPG on Cartpole for a single short epoch."""
    cartpole = CartpoleEnv()

    # Components are built in the order policy, Q-function, exploration.
    actor = DeterministicMLPPolicy(cartpole.spec)
    critic = ContinuousMLPQFunction(cartpole.spec)
    noise = OUStrategy(cartpole.spec)

    small_run = dict(
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    DDPG(env=cartpole, policy=actor, qf=critic, es=noise, **small_run).train()
def test_ddpg():
    """Check that a one-epoch DDPG run on Cartpole completes."""
    environment = CartpoleEnv()
    spec = environment.spec

    deterministic_policy = DeterministicMLPPolicy(spec)
    q_function = ContinuousMLPQFunction(spec)
    ou_noise = OUStrategy(spec)

    # Small pool and a single 100-step epoch keep the run short.
    trainer = DDPG(
        env=environment,
        policy=deterministic_policy,
        qf=q_function,
        es=ou_noise,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    trainer.train()
def test_rllab(patient_id=1, Initial_Bg=0):
    """Train DDPG for 5 epochs on a normalized simglucose Gym environment.

    The environment id is ``simglucose-adult<Initial_Bg>-CHO<patient_id + 1>-v0``.

    Returns:
        ``(es, policy)`` -- the exploration strategy and the policy -- after
        training, or ``None`` (after printing a message) when rllab cannot
        be imported.
    """
    try:
        from rllab.algos.ddpg import DDPG
        from rllab.envs.normalized_env import normalize
        from rllab.exploration_strategies.ou_strategy import OUStrategy
        from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
        from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
        from rllab.envs.gym_env import GymEnv
    except ImportError:
        print('rllab is not installed!')
        return None

    # NOTE(review): Initial_Bg fills the "adult" slot and patient_id + 1 the
    # "CHO" slot -- confirm this mapping against the registered env ids.
    env_id = 'simglucose-adult{0}-CHO{1}-v0'.format(Initial_Bg, patient_id + 1)
    glucose_env = normalize(GymEnv(env_id))
    spec = glucose_env.spec

    # Policy network: two hidden layers of 32 units each.
    policy = DeterministicMLPPolicy(env_spec=spec, hidden_sizes=(32, 32))
    es = OUStrategy(env_spec=spec)
    critic = ContinuousMLPQFunction(env_spec=spec)

    hyperparams = dict(
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=5,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    )
    DDPG(env=glucose_env, policy=policy, es=es, qf=critic,
         **hyperparams).train()

    # The environment is intentionally left open (no env.close() call).
    return es, policy
def run_task(*_):
    """Train DDPG on the normalized Gym ``LunarLanderContinuous-v2`` task.

    Positional arguments are accepted and ignored. Both the policy and the
    Q-function use two hidden layers of 64 units.
    """
    max_path_length = 400
    layer_widths = (64, 64)

    # Previously tried alternatives: HalfCheetahEnv, and Gym's
    # "BipedalWalker-v2" with force_reset and video recording.
    lander = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    spec = lander.spec

    actor = DeterministicMLPPolicy(env_spec=spec, hidden_sizes=layer_widths)
    noise = OUStrategy(env_spec=spec)
    critic = ContinuousMLPQFunction(env_spec=spec, hidden_sizes=layer_widths)

    hyperparams = dict(
        batch_size=32,
        max_path_length=max_path_length,
        train_epoch_interval=300,
        min_pool_size=500,
        replay_pool_size=10000,
        n_updates_per_sample=1,
        n_steps=75000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        # Plotting is off; pass plot=True here (and enable the matching
        # plot parameter of the experiment launcher) to turn it on.
    )
    trainer = DDPG(env=lander, policy=actor, es=noise, qf=critic,
                   **hyperparams)
    trainer.train()
def run_task(*_):
    """Train DDPG on the Gym environment selected by the global ``args``.

    Positional arguments are accepted and ignored. Reads ``args.env``,
    ``args.reward``, ``args.hidden_sizes``, ``args.batch_size``,
    ``args.n_itr``, ``args.gamma`` and ``args.scale_reward``.

    Raises:
        ValueError: if ``args.hidden_sizes`` is not one of 0, 1, 2 or 3.
            Previously this case left ``hidden_sizes`` unbound and failed
            later with an UnboundLocalError.
    """
    env = normalize(GymEnv(args.env, force_reset=True, record_video=False))
    env.wrapped_env.env.env.reward_flag = args.reward

    # Policy layer layouts selectable through ``args.hidden_sizes``.
    layouts = {
        0: (8,),
        1: (32, 32),
        2: (100, 50, 25),
        3: (400, 300),
    }
    try:
        hidden_sizes = layouts[args.hidden_sizes]
    except (KeyError, TypeError):
        raise ValueError(
            "unsupported hidden_sizes option %r; expected one of %s"
            % (args.hidden_sizes, sorted(layouts)))

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
    )
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=95,
        # The epoch length is taken from args.batch_size; the DDPG
        # minibatch size above is fixed at 64.
        epoch_length=args.batch_size,
        min_pool_size=10000,
        n_epochs=args.n_itr,
        discount=args.gamma,
        scale_reward=args.scale_reward,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        eval_samples=95,
        # Plotting is off; pass plot=True here (and enable the matching
        # plot parameter of the experiment launcher) to turn it on.
    )
    algo.train()
qf_learning_rate=1e-3, # Learning rate for training Q function policy_learning_rate=1e-4, # Learning rate for training the policy #qf_weight_decay=0.01, soft_target_tau= 0.005, # Interpolation parameter for doing the soft target update. # Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, ) log_dir = os.path.join(os.getcwd(), 'data') logger.set_snapshot_dir(log_dir) logger.add_text_output(os.path.join(log_dir, 'debug.log')) logger.add_tabular_output(os.path.join(log_dir, 'progress.csv')) logger.set_snapshot_mode('last') algo.train() # save parameters with open(os.path.join(log_dir, 'final_policy.pkl'), 'wb') as output: trained_policy = algo.policy pickle.dump(trained_policy, output, pickle.HIGHEST_PROTOCOL) print('Final policy saved') def save_large_pickled_object(obj, filepath): """ This is a defensive way to write pickle.write, allowing for very large files on all platforms """ max_bytes = 2**31 - 1 bytes_out = pickle.dumps(obj) n_bytes = sys.getsizeof(bytes_out)
# Configure DDPG from the env, policy, es and qf objects, which are built
# outside this fragment.
algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    batch_size=32,
    max_path_length=100,
    epoch_length=1000,
    min_pool_size=10000,
    n_epochs=1000,
    discount=0.99,
    scale_reward=0.01,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)
# NOTE(review): algo.train() is written as a call, not a reference. This
# presumably relies on rllab's stub mode (stub(globals()) earlier in the
# file) so that the call is recorded rather than executed here -- confirm.
run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    # plot=True,
)
# The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(128, 128)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=env.horizon, n_itr=500, discount=0.99, step_size=0.01, # Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, ) run_experiment_lite( algo.train(), # Number of parallel workers for sampling n_parallel=1, # Only keep the snapshot parameters for the last iteration snapshot_mode="last", # Specifies the seed for the experiment. If this is not provided, a random seed # will be used seed=1, use_gpu=True, # plot=True, )