num_layers = 2
activation = nn.ReLU

base_dir = "/data/mj_pend/"
trial_name = input("Trial name: ")
trial_dir = base_dir + trial_name + "/"
base_ok = input("run will be saved in " + trial_dir + " ok? y/n")
if base_ok == "n":
    exit()

start = time.time()
for seed in np.random.randint(0, 2**32, 8):
    model = SACModel(
        policy=MLP(input_size, output_size * 2, num_layers, layer_size, activation),
        value_fn=MLP(input_size, 1, num_layers, layer_size, activation),
        q1_fn=MLP(input_size + output_size, 1, num_layers, layer_size, activation),
        q2_fn=MLP(input_size + output_size, 1, num_layers, layer_size, activation),
        act_limit=3,
    )

    alg_config = {
        "env_name": env_name,
        "model": model,
        "seed": int(seed),  # int((time.time() % 1)*1e8),
        "total_steps": 1e6,
        "alpha": .2,
        "exploration_steps": 5000,
    return top_agents, reward_log, agent_log


if __name__ == "__main__":
    torch.set_default_dtype(torch.float64)
    import matplotlib.pyplot as plt
    from seagul.nn import MLP
    from seagul.rl.ars.ars_pipe2 import ARSAgent

    env_name = "Walker2d-v2"
    env = gym.make(env_name)
    in_size = env.observation_space.shape[0]
    out_size = env.action_space.shape[0]
    policy = MLP(in_size, out_size, 0, 0, bias=False)

    import time
    start = time.time()

    init_agent = ARSAgent(env_name, policy, seed=0, n_workers=12, n_delta=32, n_top=16)
    meta_agent = MetaArsAgent(0, init_agent, n_seeds=8, n_top_seeds=1, mean_lookback=3, ars_epochs=25)
from seagul.nn import MLP
import torch

base = MLP(4, 32, 2, 32)
b1 = MLP(16, 2, 16, 2)
b2 = MLP(16, 2, 16, 2)

x0 = torch.randn(1, 4)
x1 = base(x0)
xl = b1(x1[..., :16])
xr = b2(x1[..., 16:])
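# A small sketch (an assumption, not part of the original snippet) of wrapping the
# base/branch split above into a single nn.Module, so the two heads can be called
# with one forward pass and trained end to end.
import torch.nn as nn


class BranchedMLP(nn.Module):
    def __init__(self, base, b1, b2):
        super().__init__()
        self.base, self.b1, self.b2 = base, b1, b2

    def forward(self, x):
        h = self.base(x)
        # first 16 base features feed the left head, the remaining 16 feed the right head
        left = self.b1(h[..., :16])
        right = self.b2(h[..., 16:])
        return torch.cat([left, right], dim=-1)


net = BranchedMLP(base, b1, b2)
print(net(x0).shape)  # expect torch.Size([1, 4]): two heads with 2 outputs each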
def control(q):
    # LQR gains for the nominal balancing controller
    k = np.array([[-1649.86567367, -460.15780461, -716.07110032, -278.15312267]])
    gs = np.array([pi / 2, 0, 0, 0])
    return -k.dot(q - gs)


def reward_fn(s, a):
    reward = 1e-2 * (np.sin(s[0]) + np.sin(s[0] + s[1]))
    return reward, False


for seed in np.random.randint(0, 2**32, 4):
    for act_var in [.5, 1.0, 3.0]:
        max_t = 10.0
        model = SwitchedPPOModelActHold(
            # policy = MLP(input_size, output_size, num_layers, layer_size, activation),
            policy=MLP(input_size, output_size, num_layers, layer_size),
            value_fn=MLP(input_size, 1, num_layers, layer_size),
            gate_fn=torch.load("warm/lqr_gate_better"),
            nominal_policy=control,
            hold_count=20,
            thresh=.9,
        )

        env_config = {
            "init_state": [-pi/2, 0, 0, 0],
            "max_torque": max_torque,
            "init_state_weights": [0, 0, 0, 0],
            "dt": .01,
            "reward_fn": reward_fn,
            "max_t": max_t,
            "m2": m2,
from seagul.rl.algos import sac
from seagul.nn import MLP
from seagul.rl.models import SACModel, PPOModel
import ray

input_size = 3
output_size = 1
layer_size = 64
num_layers = 2

ray.init()

policy = MLP(input_size, output_size * 2, num_layers, layer_size)
value_fn = MLP(input_size, 1, num_layers, layer_size)
q1_fn = MLP(input_size + output_size, 1, num_layers, layer_size)
q2_fn = MLP(input_size + output_size, 1, num_layers, layer_size)
model = SACModel(policy, value_fn, q1_fn, q2_fn, 3)

ppo_policy = MLP(input_size, output_size, num_layers, layer_size)
ppo_model = PPOModel(ppo_policy, value_fn)

env_name = "Pendulum-v0"

# model, rews, var_dict = ray_sac(env_name, 20000, model, env_steps=0, iters_per_update=100, min_steps_per_update=100, reward_stop=-200, exploration_steps=100)
# model, rews, var_dict = ppo(env_name, 3e5, ppo_model)
model, rews, var_dict = sac(env_name, 160000, model, seed=0, env_steps=0, iters_per_update=100,
def run_and_save(arg):
    seed, save_dir = arg
    trial_dir = save_dir + "/" + "seed" + str(seed) + "/"

    # init policy, value fn
    input_size = 10
    output_size = 3
    layer_size = 32
    num_layers = 2
    activation = nn.ReLU
    env_name = "bball3_mj-v0"
    num_steps = int(1e6)

    policy = MLP(input_size, output_size * 2, num_layers, layer_size, activation)
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    q1_fn = MLP(input_size + output_size, 1, num_layers, layer_size, activation)
    q2_fn = MLP(input_size + output_size, 1, num_layers, layer_size, activation)

    # env_config = {
    #     'init_state': (0, 0, -pi / 2, .15, .75, 0, 0, 0, 0, 0),
    #     'reward_fn': reward_fn,
    #     'max_torque': 5.0,
    #     'max_steps': 50
    # }
    env_config = {}

    model = SACModel(
        policy=policy,
        value_fn=value_fn,
        q1_fn=q1_fn,
        q2_fn=q2_fn,
        act_limit=5,
    )

    alg_config = {
        "env_name": env_name,
        "model": model,
        "alpha": .02,
        "env_max_steps": 500,
        "seed": int(seed),
        "exploration_steps": 1000,
        "min_steps_per_update": 500,
        "gamma": 1,
        "sgd_batch_size": 128,
        "replay_batch_size": 512,
        # "iters_per_update": 16,
        "iters_per_update": float('inf'),
        "env_config": env_config
    }

    agent = SACAgent(**alg_config)
    agent.learn(num_steps)

    os.makedirs(trial_dir, exist_ok=False)
    with open(trial_dir + "agent.ag", "wb") as agent_file:
        torch.save(agent, agent_file)
    with open(trial_dir + "config.pkl", "wb") as config_file:
        pickle.dump(env_config, config_file)
    with open(trial_dir + "reward_fn.py", 'w') as f:
        f.write(inspect.getsource(reward_fn))
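# A possible driver for run_and_save, sketched here as an assumption (the original
# launch code is not shown): run several seeds in parallel with a process pool,
# mirroring the pool.imap pattern used in the other scripts in this repo.
# The save_dir value below is hypothetical.
if __name__ == "__main__":
    from multiprocessing import Pool
    import numpy as np

    save_dir = "/data/bball3/trial0"  # hypothetical output directory
    seeds = [int(s) for s in np.random.randint(0, 2**32, 4)]
    with Pool(processes=4) as pool:
        pool.map(run_and_save, [(seed, save_dir) for seed in seeds])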
def check_sac_model(model, obs, act_size, val_size):
    act, val, _, logp = model.step(obs)
    assert (act.shape == act_size)
    assert (val.shape == val_size)
    assert (logp.shape == act_size)
    act, logp = model.select_action(obs, torch.ones(act_size))
    assert (act.shape == act_size)


# Single output MLP size check
# =================================================================
net = MLP(4, 1, 2, 12)

obs = np.zeros(4, dtype=dtype)
assert net(obs).shape == torch.Size([1])

obs = np.zeros((1, 4), dtype=dtype)
assert net(obs).shape == torch.Size([1, 1])

obs = np.zeros((100, 4), dtype=dtype)
assert net(obs).shape == torch.Size([100, 1])

print("Single output MLP good")

# Multiple output MLP size check
# =================================================================
net = MLP(4, 4, 2, 12)
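# A sketch of how the multiple-output check would likely continue, mirroring the
# single-output checks above (these exact assertions are an assumed addition, not
# part of the original test file).
obs = np.zeros(4, dtype=dtype)
assert net(obs).shape == torch.Size([4])

obs = np.zeros((1, 4), dtype=dtype)
assert net(obs).shape == torch.Size([1, 4])

obs = np.zeros((100, 4), dtype=dtype)
assert net(obs).shape == torch.Size([100, 4])

print("Multiple output MLP good")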
from seagul.rl.algos import ppo
from seagul.nn import MLP
from seagul.rl.models import PPOModel
import torch

torch.set_default_dtype(torch.double)  # TODO need to update everything to support arbitrary dtypes

input_size = 3
output_size = 1
layer_size = 64
num_layers = 2

policy = MLP(input_size, output_size, num_layers, layer_size)
value_fn = MLP(input_size, 1, num_layers, layer_size)
model = PPOModel(policy, value_fn)

model, rews, var_dict = ppo("Pendulum-v0", 10000, model)
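# Optional follow-up (an assumed addition, not in the original run script): plot the
# reward history returned by ppo, mirroring the matplotlib usage in the other scripts here.
import matplotlib.pyplot as plt

plt.plot(rews)
plt.xlabel("update")
plt.ylabel("episode reward")
plt.show()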
import time

# init policy, value fn
input_size = 4
output_size = 1
layer_size = 0
num_layers = 0
activation = nn.ReLU

proc_list = []
for seed in range(200)[-7:]:
    # policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    policy = torch.load("warm/LQR_policy")
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)

    model = PPOModel(
        policy=policy,
        value_fn=value_fn,
        discrete=False,
        # hold_count = 0
    )

    def reward_fn(ns, act):
        return -1e-2 * ((ns[0] - np.pi)**2 + ns[1]**2 + .1*ns[2]**2 + .2*ns[3]**2)
        # return 1e-2*(np.cos(ns[0]) + np.cos(ns[0] + ns[1]))

    env_config = {
        "max_torque": 25,
        "init_state": [0.0, 0.0, 0.0, 0.0],
        "init_state_weights": np.array([.1, .3, .1, .3]),
from seagul.rl.ars.ars_pool import ars
from seagul.nn import MLP
import torch
import matplotlib.pyplot as plt
import gym.envs.mujoco.reacher

torch.set_default_dtype(torch.float64)

net = MLP(17, 6, 64, 2)
net, r = ars("HalfCheetah-v2", net, 100)

plt.plot(r)
plt.show()
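# A rough evaluation sketch (an assumed addition, not in the original script): roll the
# trained policy out in the environment once and report the return, using the old gym
# step/reset API that these scripts target.
import gym

env = gym.make("HalfCheetah-v2")
obs = env.reset()
ep_rew, done = 0.0, False
while not done:
    act = net(torch.as_tensor(obs)).detach().numpy()
    obs, rew, done, _ = env.step(act)
    ep_rew += rew
print("evaluation return:", ep_rew)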
for i, ival in enumerate(results[:, :]):
    for j, val in enumerate(ival):
        if val:
            plt.plot(th1dot_vals[i], th2dot_vals[j], 'o', color='black')


# In[ ]:

# #import pickle
# pickle.dump(X, open('./warm/X_zv_128', 'wb'))
# pickle.dump(Y, open('./warm/Y_zv_128', 'wb'))


# In[ ]:

from seagul.nn import MLP

pol = MLP(4, 1, 0, 0, input_bias=True)
d = pol.state_dict()
d['output_layer.weight'] = torch.tensor([[1316.85, 555.42, 570.33, 272.58]], dtype=torch.float32)
d['output_layer.bias'] = torch.tensor([0.0], dtype=torch.float32)
d['input_bias'] = torch.tensor([-np.pi, 0.0, 0.0, 0.0], dtype=torch.float32)
pol.load_state_dict(d)
torch.save(pol, 'warm/LQR_policy')


# In[65]:

def do_rollout(trial_num):
    np.random.seed(trial_num)
    act_hold = 20
    hold_count = 0
out_size = env.action_space.shape[0]

policy_dict = {fn.__name__: [] for fn in post_fns}
rewards = xr.DataArray(np.zeros((num_experiments, num_seeds, num_epochs)),
                       dims=("post", "trial", "epoch"),
                       coords={"post": [fn.__name__ for fn in post_fns]})
post_rewards = xr.DataArray(np.zeros((num_experiments, num_seeds, num_epochs)),
                            dims=("post", "trial", "epoch"),
                            coords={"post": [fn.__name__ for fn in post_fns]})

data = xr.Dataset(
    {"rews": rewards, "post_rews": post_rewards},
    coords={"post": [fn.__name__ for fn in post_fns]},
    attrs={"policy_dict": policy_dict,
           "post_fns": post_fns,
           "env_name": env_name,
           "hyperparams": {"num_experiments": num_experiments, "num_seeds": num_seeds,
                           "num_epochs": num_epochs, "n_workers": n_workers,
                           "n_delta": n_delta, "n_top": n_top, "exp_noise": exp_noise},
           "env_config": env_config})

for post_fn in post_fns:
    for i in range(num_seeds):
        policy = MLP(in_size, out_size, 0, 0)
        policy, r_hist, lr_hist = ars(env_name, policy, num_epochs,
                                      n_workers=n_workers, n_delta=n_delta, n_top=n_top,
                                      exp_noise=exp_noise, postprocess=post_fn,
                                      env_config=env_config)
        print(f"{env_name}, {post_fn.__name__}, {i}, {time.time() - start}")

        data.policy_dict[post_fn.__name__].append(copy.deepcopy(policy))
        data.rews.loc[post_fn.__name__, i, :] = lr_hist
        data.post_rews.loc[post_fn.__name__, i, :] = r_hist

torch.save(data, f"{save_dir}{env_name}.xr")
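# A minimal sketch of reading the saved Dataset back and comparing the mean learning
# curves per postprocessing function. The file path and variable names match what was
# saved above; the plotting itself is an assumed addition, not part of the original script.
import torch
import matplotlib.pyplot as plt

data = torch.load(f"{save_dir}{env_name}.xr")
for name in data.coords["post"].values:
    mean_curve = data.rews.sel(post=name).mean(dim="trial")
    plt.plot(mean_curve, label=str(name))
plt.xlabel("epoch")
plt.ylabel("mean reward")
plt.legend()
plt.show()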
samples += np.array([th1_max, th2_max, th1dot_max, th2dot_max])

total_steps = 0
for i, res in enumerate(
        pool.imap(do_rollout, zip(samples, range(int(num_trials / 2), int(num_trials))))):
    y, steps = res
    total_steps += steps
    X[i + int(num_trials / 2), :] = samples[i, :]
    Y[i + int(num_trials / 2)] = y

print("Supervised data generated")


# In[4]:

net = MLP(4, 1, 2, 32)

w = 1e-2  # Weighting parameter to encourage learning a conservative region of attraction
class_weight = torch.tensor(Y.shape[0] / sum(Y) * w, dtype=torch.float32)

# This is just doing some pretty standard supervised learning, the source is available in seagul
loss_hist = fit_model(
    net, X, Y, 50,
    batch_size=2048,
    loss_fn=torch.nn.BCEWithLogitsLoss(pos_weight=class_weight))

plt.close()
plt.plot(loss_hist)
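# Rough sanity check (an assumed addition, not from the original notebook): since the
# network was trained with BCEWithLogitsLoss, pass its outputs through a sigmoid and
# threshold at 0.5 to see what fraction of the labeled samples it classifies correctly.
with torch.no_grad():
    preds = (torch.sigmoid(net(X)) > 0.5).numpy().squeeze()
acc = (preds == np.asarray(Y).squeeze().astype(bool)).mean()
print("training-set accuracy:", acc)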
from seagul.rl.models import SACModel
from seagul.nn import MLP  # needed for the networks built below
import seagul.envs
import torch
import torch.nn as nn
import time
import gym

input_size = 4
output_size = 1
layer_size = 256
num_layers = 2
activation = nn.ReLU
device = 'cpu'

policy = MLP(input_size, output_size * 2, num_layers, layer_size, activation).to(device)
value_fn = MLP(input_size, 1, num_layers, layer_size, activation).to(device)
q1_fn = MLP(input_size + output_size, 1, num_layers, layer_size, activation).to(device)
q2_fn = MLP(input_size + output_size, 1, num_layers, layer_size, activation).to(device)
model = SACModel(policy, value_fn, q1_fn, q2_fn, 25)

env = gym.make('su_acrobot-v0')


def do_rollout(env, model, num_steps):
    acts_list = []
    obs1_list = []
    obs2_list = []
    rews_list = []
    done_list = []

    dtype = torch.float32
        done = True
    return reward, done


env_config1 = {
    "init_state": [0, 0, 0, 0],
    "max_torque": max_torque,
    "init_state_weights": [0, 0, 0, 0],
    "dt": .01,
    "reward_fn": reward_fn,
    "max_t": max_t,
    "act_hold": 20
}

policy = MLP(input_size, output_size, num_layers, layer_size, activation)
value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
model1 = PPOModel(policy=policy, value_fn=value_fn, action_var=1)

env_config2 = {
    "init_state": [0, 0, 0, 0],
    "max_torque": max_torque,
    "init_state_weights": [0, 0, 0, 0],
    "dt": .01,
    "reward_fn": reward_fn,
    "max_t": max_t,
    "act_hold": 1
}

model2 = PPOModelActHold(policy=policy, value_fn=value_fn, action_var=1,
from seagul.nn import MLP, fit_model
import torch
import torch.nn as nn
import time

# Compare training speed of a plain nn.Sequential network against the equivalent seagul MLP
net1 = nn.Sequential(
    nn.Linear(4, 32), nn.ReLU(),
    nn.Linear(32, 32), nn.ReLU(),
    nn.Linear(32, 1)
)
net2 = MLP(4, 1, num_layers=2, layer_size=32)

X = torch.rand(40960, 4)
Y = torch.rand(40960, 1)

start = time.time()
fit_model(net1, X, Y, num_epochs=10)
print(time.time() - start)

start = time.time()
fit_model(net2, X, Y, num_epochs=10)
print(time.time() - start)
for i, res in enumerate(pool.imap(do_rollout, zip(samples, range(int(num_trials/2), int(num_trials))))):
    rews, steps = res
    reward_hist[i, :, :] = rews
    total_steps += steps
    X[i+int(num_trials/2), :] = samples[i, :]
    Y[i+int(num_trials/2)] = sum(rews) > env.num_steps*3 - 5

print(time.time() - start)

# %%
from seagul.nn import MLP, fit_model
import torch

net = MLP(4, 1, 2, 32)  # output_activation=torch.nn.Softmax)

Y0 = np.ones((num_trials, 1), dtype=np.float32)
w = 1e-2
class_weight = torch.tensor(Y.shape[0]/sum(Y)*w, dtype=torch.float32)

loss_hist = fit_model(net, X, Y, 50, batch_size=2048, loss_fn=torch.nn.BCEWithLogitsLoss(pos_weight=class_weight))
# loss_hist = fit_model(net, X, Y, 50, batch_size=2048, loss_fn=torch.nn.BCEWithLogitsLoss())
# loss_hist = fit_model(net, X, Y, 100, batch_size=2048)
# loss_hist = fit_model(net, X, Y0, 5, batch_size=2048, loss_fn=torch.nn.BCEWithLogitsLoss(pos_weight=class_weight))

plt.close()
plt.plot(loss_hist)
plt.show()
else: print("Error: seed:", seed, "failed") print("Rewards were", rewards[-1]) start = time.time() for seed in np.random.randint(0, 2 ** 32, 8): # init policy, value fn input_size = 4 output_size = 1 layer_size = 16 num_layers = 1 activation = nn.ReLU model = TD3Model( policy = MLP(input_size, output_size, num_layers, layer_size, activation), q1_fn = MLP(input_size+output_size, 1, num_layers, layer_size, activation), q2_fn = MLP(input_size+output_size, 1, num_layers, layer_size, activation), act_limit=3 ) alg_config = { "env_name": env_name, "model": model, "seed": int(seed), # int((time.time() % 1)*1e8), "train_steps" : 5e5, "exploration_steps": 500, "reward_stop": 1000, "gamma": .95, "act_std_schedule": (1, .1,), "replay_batch_size": 128,