def run_and_test(seed, verbose=False):
    input_size = 11
    output_size = 2
    layer_size = 64
    num_layers = 2
    activation = nn.ReLU

    # Policy and value function share the same MLP architecture
    policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    model = PPOModel(policy, value_fn, init_logstd=-.5, learn_std=True)

    agent = PPOAgent(env_name="Reacher-v2",
                     model=model,
                     epoch_batch_size=2048,
                     gamma=.99,
                     seed=int(seed),
                     entropy_coef=0.0,
                     sgd_batch_size=64,
                     lr_schedule=(1e-3,),
                     sgd_epochs=30,
                     target_kl=float('inf'),
                     clip_val=True,
                     env_no_term_steps=50,
                     reward_stop=-2.5,
                     normalize_return=True,
                     normalize_obs=True,
                     normalize_adv=True)

    t_model, rewards, var_dict = agent.learn(total_steps=2.5e5)

    if verbose:
        if var_dict["early_stop"]:
            print("seed", seed, "reached reward_stop in", len(rewards), "steps")
        else:
            print("Error: seed:", seed, "failed")

    torch.save(var_dict, open("./tmp/" + str(seed), 'wb'), pickle_module=dill)

    return rewards, var_dict["early_stop"]
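
# --- Usage sketch (not part of the original script) ---
# The driver that calls run_and_test is not shown in this excerpt; a minimal
# sketch, assuming run_and_test above is in scope, sweeps a few seeds and
# reports which ones hit reward_stop. The seed range and result handling
# below are illustrative only.
if __name__ == "__main__":
    results = {}
    for seed in range(4):
        rewards, early_stop = run_and_test(seed, verbose=True)
        results[seed] = early_stop
    print("seeds that converged:", [s for s, ok in results.items() if ok])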
def run_and_test(seed, verbose=True):
    input_size = 4
    output_size = 1
    layer_size = 16
    num_layers = 1
    activation = nn.ReLU

    policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    model = PPOModel(policy, value_fn, init_logstd=-.5, learn_std=True)

    agent = PPOAgent(env_name="InvertedPendulum-v2",
                     model=model,
                     epoch_batch_size=2048,
                     seed=int(seed),
                     sgd_batch_size=512,
                     lr_schedule=(1e-3,),
                     sgd_epochs=50,
                     target_kl=.05,
                     env_no_term_steps=1000,
                     reward_stop=1000,
                     normalize_return=False,
                     normalize_adv=True)

    t_model, rewards, var_dict = agent.learn(total_steps=2e6)

    if verbose:
        if var_dict["early_stop"]:
            print("seed", seed, "achieved 1000 reward in", len(rewards), "steps")
        else:
            print("Error: seed:", seed, "failed")

    return rewards, var_dict["early_stop"]
def run_and_test(seed, verbose=False):
    input_size = 26
    output_size = 6
    layer_size = 64
    num_layers = 2
    activation = nn.ReLU

    policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    model = PPOModel(policy, value_fn, init_logstd=-.5, learn_std=True)

    agent = PPOAgent(env_name="HalfCheetahBulletEnv-v0",
                     model=model,
                     epoch_batch_size=4096,
                     seed=int(seed),
                     entropy_coef=0.0,
                     sgd_batch_size=4096,
                     lr_schedule=[3e-4, 0],
                     sgd_epochs=50,
                     target_kl=.1,
                     clip_val=True,
                     env_no_term_steps=1000,
                     reward_stop=3000,
                     normalize_return=True,
                     normalize_obs=True,
                     normalize_adv=True)

    t_model, rewards, var_dict = agent.learn(total_steps=5e4)

    torch.save(var_dict, open("./tmp/" + str(seed), 'wb'), pickle_module=dill)

    if verbose:
        if var_dict["early_stop"]:
            print("seed", seed, "reached reward_stop in", len(rewards), "steps")
        else:
            print("Error: seed:", seed, "failed")

    return rewards, var_dict["early_stop"]
def run_and_test(seed, verbose=True):
    input_size = 11
    output_size = 1
    layer_size = 32
    num_layers = 1
    activation = nn.ReLU

    # Policy outputs both a mean and a log-std per action dimension
    policy = MLP(input_size, output_size * 2, num_layers, layer_size, activation)
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    model = PPOModel(policy, value_fn, fixed_std=False)

    agent = PPOAgent(env_name="InvertedDoublePendulum-v2",
                     model=model,
                     epoch_batch_size=2048,
                     seed=int(seed),
                     sgd_batch_size=512,
                     lr_schedule=(1e-3,),
                     sgd_epochs=50,
                     target_kl=.05,
                     env_no_term_steps=1000,
                     reward_stop=9100.0,
                     normalize_return=False,
                     normalize_adv=True)

    t_model, rewards, var_dict = agent.learn(total_steps=1e6)

    torch.save(var_dict, open("./tmp/" + str(seed), 'wb'), pickle_module=dill)

    if verbose:
        if var_dict["early_stop"]:
            print("seed", seed, "reached reward_stop in", len(rewards), "steps")
        else:
            print("Error: seed:", seed, "failed")

    return rewards, var_dict["early_stop"]
def run_and_test(seed, verbose=True):
    input_size = 3
    output_size = 1
    layer_size = 32
    num_layers = 2
    activation = nn.ReLU

    policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    model = PPOModel(policy, value_fn, init_logstd=-.5, learn_std=True)

    # Define our hyperparameters
    agent = PPOAgent(env_name="Pendulum-v0",
                     model=model,
                     epoch_batch_size=2048,
                     seed=int(seed),
                     sgd_batch_size=64,
                     lr_schedule=(1e-3,),
                     sgd_epochs=30,
                     target_kl=float('inf'),
                     clip_val=True,
                     reward_stop=-200,
                     normalize_return=True,
                     normalize_obs=True,
                     normalize_adv=True)

    t_model, rewards, var_dict = agent.learn(total_steps=2e6)

    if verbose:
        if var_dict["early_stop"]:
            print("seed", seed, "reached reward_stop in", len(rewards), "steps")
        else:
            print("Error: seed:", seed, "failed")

    return rewards, var_dict["early_stop"]
def run_and_test(seed, verbose=True):
    input_size = 4
    output_size = 2
    layer_size = 32
    num_layers = 2
    activation = nn.ReLU

    # Policy outputs both a mean and a log-std per action dimension
    policy = MLP(input_size, output_size * 2, num_layers, layer_size, activation)
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    model = PPOModel(policy, value_fn, action_std=0.1, fixed_std=False)

    env_name = "linear_z-v0"

    def reward_fn(s):
        # Reward distance from the origin in the x-z plane, flipping the target
        # quadrant (encoded in s[3]) each time the current quadrant is reached.
        if s[3] > 0:
            if s[0] >= 0 and s[2] >= 0:
                reward = np.clip(np.sqrt(s[0]**2 + s[2]**2), 0, 10)
                # reward = 5 - np.clip(np.abs(np.sqrt(s[0]**2 + s[2]**2) - 5)**2, 0, 5)
                s[3] = -10
            else:
                reward = 0.0
        elif s[3] < 0:
            if s[0] <= 0 and s[2] <= 0:
                reward = np.clip(np.sqrt(s[0]**2 + s[2]**2), 0, 10)
                # reward = 5 - np.clip(np.abs(np.sqrt(s[0]**2 + s[2]**2)**2 - 5), 0, 5)
                s[3] = 10
            else:
                reward = 0.0
        else:
            # s[3] == 0: no target quadrant set, no reward
            reward = 0.0

        return reward, s

    num_steps = 500
    env_config = {
        "reward_fn": reward_fn,
        "xyz_max": float('inf'),
        "num_steps": num_steps,
        "act_hold": 10,
        "integrator": euler,
        "dt": .01,
        "init_noise_max": 10,
    }

    t_model, rewards, var_dict = ppo(env_name=env_name,
                                     model=model,
                                     total_steps=2e6,
                                     epoch_batch_size=1024,
                                     sgd_batch_size=512,
                                     lam=.2,
                                     gamma=.95,
                                     env_config=env_config,
                                     sgd_epochs=30,
                                     target_kl=.05,
                                     reward_stop=150,
                                     seed=int(seed))

    if verbose:
        if var_dict["early_stop"]:
            print("seed", seed, "reached reward_stop in", len(rewards), "steps")
        else:
            print("Error: seed:", seed, "failed")

    return rewards
import torch.nn as nn

from seagul.rl.ppo.ppo2 import PPOAgent
from seagul.nn import MLP
from seagul.rl.ppo.models import PPOModel
import seagul.envs

input_size = 1
output_size = 1
layer_size = 32
num_layers = 2
activation = nn.ReLU

policy = MLP(input_size, output_size, num_layers, layer_size, activation)
value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
model = PPOModel(policy, value_fn, init_logstd=-.5, learn_std=True)

# Define our hyperparameters
agent = PPOAgent(env_name="ProbeEnv1-v0",
                 model=model,
                 epoch_batch_size=2048,
                 seed=0,
                 sgd_batch_size=64,
                 lr_schedule=(1e-3,),
                 sgd_epochs=30,
                 target_kl=float('inf'),
                 clip_val=True,
                 env_no_term_steps=100,
                 normalize_return=True,
                 normalize_obs=True,
                 normalize_adv=True)
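
# --- Usage sketch (not part of the original script) ---
# The excerpt stops after the agent is constructed. Mirroring the other
# scripts in this set, training would presumably be kicked off with
# agent.learn(); the step budget below is an illustrative value, not one
# taken from the source.
t_model, rewards, var_dict = agent.learn(total_steps=1e5)
print("early_stop:", var_dict["early_stop"], "reward entries:", len(rewards))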
        reward = 0.0

    return reward, s


# if __name__ == "__main__":
input_size = 3
output_size = 1
layer_size = 32
num_layers = 2
activation = nn.ReLU

policy = MLP(input_size, output_size * 2, num_layers, layer_size, activation)
value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
model = PPOModel(policy, value_fn, action_std=0.1, fixed_std=False)

env_name = "linear_z2d-v0"

num_steps = 500
env_config = {
    "reward_fn": reward_fn,
    "xz_max": float('inf'),
    "num_steps": num_steps,
    "act_hold": 10,
    "integrator": euler,
    "dt": .01,
    "init_noise_max": 10,
}

alg_config = {
    "env_name": env_name,