Example #1
num_layers = 2
activation = nn.ReLU

base_dir = "/data/mj_pend/"
trial_name = input("Trial name: ")

trial_dir = base_dir + trial_name + "/"
base_ok = input("Run will be saved in " + trial_dir + ". Continue? (y/n) ")

if base_ok == "n":
    exit()

start = time.time()
for seed in np.random.randint(0, 2**32, 8):
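    # SAC needs four networks: a policy head that outputs a mean and log-std per
    # action dimension (hence output_size * 2), a state-value function, and two
    # Q-functions over concatenated (state, action) inputs.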

    model = SACModel(policy=MLP(input_size, output_size * 2, num_layers,
                                layer_size, activation),
                     value_fn=MLP(input_size, 1, num_layers, layer_size,
                                  activation),
                     q1_fn=MLP(input_size + output_size, 1, num_layers,
                               layer_size, activation),
                     q2_fn=MLP(input_size + output_size, 1, num_layers,
                               layer_size, activation),
                     act_limit=3)

    alg_config = {
        "env_name": env_name,
        "model": model,
        "seed": int(seed),  # int((time.time() % 1)*1e8),
        "total_steps": 1e6,
        "alpha": .2,
        "exploration_steps": 5000,
Example #2
        return top_agents, reward_log, agent_log


if __name__ == "__main__":
    torch.set_default_dtype(torch.float64)
    import matplotlib.pyplot as plt
    from seagul.nn import MLP
    from seagul.rl.ars.ars_pipe2 import ARSAgent

    env_name = "Walker2d-v2"
    env = gym.make(env_name)
    in_size = env.observation_space.shape[0]
    out_size = env.action_space.shape[0]

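    # With 0 hidden layers and bias disabled, the MLP reduces to a single linear
    # map from observations to actions, i.e. the linear policy used in standard ARS.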
    policy = MLP(in_size, out_size, 0, 0, bias=False)

    import time
    start = time.time()
    init_agent = ARSAgent(env_name,
                          policy,
                          seed=0,
                          n_workers=12,
                          n_delta=32,
                          n_top=16)
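    # Meta-search over ARS runs: presumably trains n_seeds seeded copies of
    # init_agent for ars_epochs each and keeps the n_top_seeds best (cf. the
    # top_agents returned above).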
    meta_agent = MetaArsAgent(0,
                              init_agent,
                              n_seeds=8,
                              n_top_seeds=1,
                              mean_lookback=3,
                              ars_epochs=25)
Example #3
from seagul.nn import MLP
import torch

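# Branching architecture: a shared trunk maps the 4-dim input to 32 features,
# which are split into two 16-dim halves, each handled by its own branch MLP.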
base = MLP(4, 32, 2, 32)
b1 = MLP(16, 2, 2, 16)  # each branch: 16-dim half -> 2 outputs, two 16-unit hidden layers
b2 = MLP(16, 2, 2, 16)

x0 = torch.randn(1, 4)
x1 = base(x0)

xl = b1(x1[..., :16])
xr = b2(x1[..., 16:])
Example #4
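# Nominal controller: linear state feedback with gain k, regulating the state
# toward the goal gs.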
def control(q):
    k = np.array([[-1649.86567367, -460.15780461, -716.07110032, -278.15312267]])
    gs = np.array([pi / 2, 0, 0, 0])
    return -k.dot(q - gs)

def reward_fn(s, a):
    reward = 1e-2*(np.sin(s[0]) + np.sin(s[0] + s[1]))
    return reward, False

for seed in np.random.randint(0, 2**32, 4):
    for act_var in [.5, 1.0, 3.0]:
        max_t = 10.0

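        # Switched policy: gate_fn (a pretrained network loaded below) presumably
        # selects between the learned PPO policy and the nominal controller using
        # threshold thresh, and each chosen action is held for hold_count steps.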
        model = SwitchedPPOModelActHold(
            policy=MLP(input_size, output_size, num_layers, layer_size),
            value_fn=MLP(input_size, 1, num_layers, layer_size),
            gate_fn=torch.load("warm/lqr_gate_better"),
            nominal_policy=control,
            hold_count=20,
            thresh=.9,
        )

        env_config = {
            "init_state": [-pi/2, 0, 0, 0],
            "max_torque": max_torque,
            "init_state_weights": [0, 0, 0, 0],
            "dt": .01,
            "reward_fn" : reward_fn,
            "max_t" : max_t,
            "m2": m2,
Example #5
from seagul.rl.algos import sac
from seagul.nn import MLP
from seagul.rl.models import SACModel, PPOModel
import ray

input_size = 3
output_size = 1
layer_size = 64
num_layers = 2

ray.init()

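# SAC networks: the policy outputs 2 * output_size values (a mean and log-std per
# action dimension); the last positional argument to SACModel is the action limit.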
policy = MLP(input_size, output_size * 2, num_layers, layer_size)
value_fn = MLP(input_size, 1, num_layers, layer_size)
q1_fn = MLP(input_size + output_size, 1, num_layers, layer_size)
q2_fn = MLP(input_size + output_size, 1, num_layers, layer_size)
model = SACModel(policy, value_fn, q1_fn, q2_fn, 3)

ppo_policy = MLP(input_size, output_size, num_layers, layer_size)
ppo_model = PPOModel(ppo_policy, value_fn)

env_name = "Pendulum-v0"
#model, rews, var_dict = ray_sac(env_name, 20000, model, env_steps=0, iters_per_update=100, min_steps_per_update=100, reward_stop=-200, exploration_steps=100)
#model, rews, var_dict = ppo(env_name, 3e5, ppo_model)

model, rews, var_dict = sac(env_name,
                            160000,
                            model,
                            seed=0,
                            env_steps=0,
                            iters_per_update=100,
Example #6
def run_and_save(arg):
    seed, save_dir = arg
    trial_dir = save_dir + "/" + "seed" + str(seed) + "/"

    # init policy, value fn
    input_size = 10
    output_size = 3
    layer_size = 32
    num_layers = 2
    activation = nn.ReLU
    env_name = "bball3_mj-v0"
    num_steps = int(1e6)

    policy = MLP(input_size, output_size * 2, num_layers, layer_size,
                 activation)
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    q1_fn = MLP(input_size + output_size, 1, num_layers, layer_size,
                activation)
    q2_fn = MLP(input_size + output_size, 1, num_layers, layer_size,
                activation)

    # env_config = {
    #     'init_state': (0, 0, -pi / 2, .15, .75, 0, 0, 0, 0, 0),
    #     'reward_fn': reward_fn,
    #     'max_torque': 5.0,
    #     'max_steps': 50
    # }
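    # Use the environment's default configuration; the custom init state and
    # reward function above are left disabled.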
    env_config = {}

    model = SACModel(
        policy=policy,
        value_fn=value_fn,
        q1_fn=q1_fn,
        q2_fn=q2_fn,
        act_limit=5,
    )

    alg_config = {
        "env_name": env_name,
        "model": model,
        "alpha": .02,
        "env_max_steps": 500,
        "seed": int(seed),
        "exploration_steps": 1000,
        "min_steps_per_update": 500,
        "gamma": 1,
        "sgd_batch_size": 128,
        "replay_batch_size": 512,
        #"iters_per_update": 16,
        "iters_per_update": float('inf'),
        "env_config": env_config
    }

    # Create the output directory up front so a name collision fails before the
    # (potentially long) training run rather than after it.
    os.makedirs(trial_dir, exist_ok=False)

    agent = SACAgent(**alg_config)
    agent.learn(num_steps)

    with open(trial_dir + "agent.ag", "wb") as agent_file:
        torch.save(agent, agent_file)

    with open(trial_dir + "config.pkl", "wb") as config_file:
        pickle.dump(env_config, config_file)

    with open(trial_dir + "reward_fn.py", 'w') as f:
        f.write(inspect.getsource(reward_fn))
Example #7

def check_sac_model(model, obs, act_size, val_size):
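    """Sanity-check the shapes returned by a SACModel's step() and select_action()."""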

    act, val, _, logp = model.step(obs)
    assert (act.shape == act_size)
    assert (val.shape == val_size)
    assert (logp.shape == act_size)

    act, logp = model.select_action(obs, torch.ones(act_size))
    assert (act.shape == act_size)


# Single output MLP size check
# =================================================================
net = MLP(4, 1, 2, 12)
obs = np.zeros(4, dtype=dtype)
assert net(obs).shape == torch.Size([1])


obs = np.zeros((1,4), dtype=dtype)
assert net(obs).shape == torch.Size([1,1])

obs = np.zeros((100,4), dtype=dtype)
assert net(obs).shape == torch.Size([100,1])

print("Single output MLP good")

# Multiple output MLP size check
# =================================================================
net = MLP(4, 4, 2, 12)
Example #8
from seagul.rl.algos import ppo
from seagul.nn import MLP
from seagul.rl.models import PPOModel
import torch

torch.set_default_dtype(torch.double)  # TODO: need to update everything to support arbitrary dtypes

input_size = 3
output_size = 1
layer_size = 64
num_layers = 2

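# Minimal PPO baseline on Pendulum-v0 (3-dim observation, 1-dim action): a small
# policy MLP plus a scalar value function.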
policy = MLP(input_size, output_size, num_layers, layer_size)
value_fn = MLP(input_size, 1, num_layers, layer_size)
model = PPOModel(policy, value_fn)

model, rews, var_dict = ppo("Pendulum-v0", 10000, model)
Example #9
import time

# init policy, valuefn
input_size = 4
output_size = 1
layer_size = 0
num_layers = 0
activation = nn.ReLU


proc_list = []

for seed in range(200)[-7:]:
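    # Warm start: load a linear policy pre-built from LQR gains (see the
    # warm/LQR_policy construction elsewhere in these examples) instead of a fresh MLP.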
    #        policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    policy = torch.load("warm/LQR_policy")
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    model = PPOModel(
        policy=policy,
        value_fn=value_fn,
        discrete=False,
        # hold_count=0
    )

    def reward_fn(ns, act):
        # Quadratic penalty on deviation from the target state (th1 = pi, all
        # else zero); an alternative height-based reward is kept below.
        return -1e-2*((ns[0] - np.pi)**2 + ns[1]**2 + .1*ns[2]**2 + .2*ns[3]**2)
        # return 1e-2*(np.cos(ns[0]) + np.cos(ns[0] + ns[1]))

    env_config = {
        "max_torque" : 25,
        "init_state" : [0.0, 0.0, 0.0, 0.0],
        "init_state_weights" : np.array([.1, .3, .1, .3]),
Example #10
from seagul.rl.ars.ars_pool import ars
from seagul.nn import MLP
import torch
import matplotlib.pyplot as plt
import gym.envs.mujoco.reacher

torch.set_default_dtype(torch.float64)

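# HalfCheetah-v2 has 17-dim observations and 6-dim actions; train the MLP policy
# with ARS for 100 epochs and plot the resulting reward history.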
net = MLP(17, 6, 2, 64)  # args: in_size, out_size, num_layers, layer_size

net, r = ars("HalfCheetah-v2", net, 100)
plt.plot(r)
plt.show()
Example #11
for i, ival in enumerate(results[:, :]):
    for j, val in enumerate(ival):
        if val:
            plt.plot(th1dot_vals[i], th2dot_vals[j], 'o', color='black')

# In[ ]:
#

#import pickle
# pickle.dump(X, open('./warm/X_zv_128', 'wb'))
# pickle.dump(Y, open('./warm/Y_zv_128', 'wb'))

# In[ ]:

from seagul.nn import MLP
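# Pack LQR gains into a linear "policy" (an MLP with no hidden layers) so the
# controller can be used wherever a torch policy is expected; input_bias
# presumably shifts the state before the single linear layer is applied.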
pol = MLP(4, 1, 0, 0, input_bias=True)
d = pol.state_dict()
d['output_layer.weight'] = torch.tensor([[1316.85, 555.42, 570.33, 272.58]],
                                        dtype=torch.float32)
d['output_layer.bias'] = torch.tensor([0.0], dtype=torch.float32)
d['input_bias'] = torch.tensor([-np.pi, 0.0, 0.0, 0.0], dtype=torch.float32)
pol.load_state_dict(d)
torch.save(pol, 'warm/LQR_policy')

# In[65]:


def do_rollout(trial_num):
    np.random.seed(trial_num)
    act_hold = 20
    hold_count = 0
Example #12
        out_size = env.action_space.shape[0]
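        # Log results in labeled xarray structures: rewards are indexed by
        # postprocessing function ("post"), seed ("trial"), and epoch.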
        policy_dict = {fn.__name__: [] for fn in post_fns}

        rewards = xr.DataArray(
            np.zeros((num_experiments, num_seeds, num_epochs)),
            dims=("post", "trial", "epoch"),
            coords={"post": [fn.__name__ for fn in post_fns]})

        post_rewards = xr.DataArray(
            np.zeros((num_experiments, num_seeds, num_epochs)),
            dims=("post", "trial", "epoch"),
            coords={"post": [fn.__name__ for fn in post_fns]})

        data = xr.Dataset(
            {"rews": rewards, "post_rews": post_rewards},
            coords={"post": [fn.__name__ for fn in post_fns]},
            attrs={"policy_dict": policy_dict,
                   "post_fns": post_fns,
                   "env_name": env_name,
                   "hyperparams": {"num_experiments": num_experiments,
                                   "num_seeds": num_seeds,
                                   "num_epochs": num_epochs,
                                   "n_workers": n_workers,
                                   "n_delta": n_delta,
                                   "n_top": n_top,
                                   "exp_noise": exp_noise},
                   "env_config": env_config})

        for post_fn in post_fns:
            for i in range(num_seeds):
                policy = MLP(in_size,out_size,0,0)
                policy, r_hist, lr_hist = ars(env_name, policy, num_epochs, n_workers=n_workers, n_delta=n_delta, n_top=n_top, exp_noise=exp_noise, postprocess=post_fn, env_config=env_config)
                print(f"{env_name}, {post_fn.__name__}, {i}, {time.time() - start}")
                data.policy_dict[post_fn.__name__].append(copy.deepcopy(policy))
                data.rews.loc[post_fn.__name__,i,:] = lr_hist
                data.post_rews.loc[post_fn.__name__,i,:] = r_hist

        torch.save(data, f"{save_dir}{env_name}.xr")
Example #13
samples += np.array([th1_max, th2_max, th1dot_max, th2dot_max])
total_steps = 0

for i, res in enumerate(
        pool.imap(do_rollout,
                  zip(samples, range(int(num_trials / 2), int(num_trials))))):
    y, steps = res
    total_steps += steps
    X[i + int(num_trials / 2), :] = samples[i, :]
    Y[i + int(num_trials / 2)] = y

print("Supervised data generated")

# In[4]:

net = MLP(4, 1, 2, 32)

w = 1e-2  # Weighting parameter to encourage learning a conservative region of attraction
class_weight = torch.tensor(Y.shape[0] / sum(Y) * w, dtype=torch.float32)
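# pos_weight scales the positive-class term of BCEWithLogitsLoss; multiplying the
# inverse class frequency by w < 1 under-weights positives, biasing the fit toward
# a conservative (smaller) predicted region, as noted above.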
# This is just doing some pretty standard supervised learning, the source is available in seagul
loss_hist = fit_model(
    net,
    X,
    Y,
    50,
    batch_size=2048,
    loss_fn=torch.nn.BCEWithLogitsLoss(pos_weight=class_weight))

plt.close()
plt.plot(loss_hist)
Example #14
from seagul.rl.models import SACModel
from seagul.nn import MLP
import seagul.envs
import torch
import torch.nn as nn
import time
import gym

input_size = 4
output_size = 1
layer_size = 256
num_layers = 2
activation = nn.ReLU

device = 'cpu'

policy = MLP(input_size, output_size * 2, num_layers, layer_size, activation).to(device)
value_fn = MLP(input_size, 1, num_layers, layer_size, activation).to(device)
q1_fn = MLP(input_size + output_size, 1, num_layers, layer_size, activation).to(device)
q2_fn = MLP(input_size + output_size, 1, num_layers, layer_size, activation).to(device)
model = SACModel(policy, value_fn, q1_fn, q2_fn, 25)

env = gym.make('su_acrobot-v0')

def do_rollout(env, model, num_steps):
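    """Roll out the model in env for num_steps, recording actions, observations,
    rewards, and done flags."""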
    acts_list = []
    obs1_list = []
    obs2_list = []
    rews_list = []
    done_list = []

    dtype = torch.float32
Example #15
        done = True

    return reward, done


env_config1 = {
    "init_state": [0, 0, 0, 0],
    "max_torque": max_torque,
    "init_state_weights": [0, 0, 0, 0],
    "dt": .01,
    "reward_fn": reward_fn,
    "max_t": max_t,
    "act_hold": 20
}

policy = MLP(input_size, output_size, num_layers, layer_size, activation)
value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
model1 = PPOModel(policy=policy, value_fn=value_fn, action_var=1)
env_config2 = {
    "init_state": [0, 0, 0, 0],
    "max_torque": max_torque,
    "init_state_weights": [0, 0, 0, 0],
    "dt": .01,
    "reward_fn": reward_fn,
    "max_t": max_t,
    "act_hold": 1
}
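# The two configs differ only in act_hold: env_config1 repeats each action for 20
# sim steps while env_config2 applies a new action every step (model2 below
# presumably does the holding on the policy side via the ActHold model variant).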

model2 = PPOModelActHold(policy=policy,
                         value_fn=value_fn,
                         action_var=1,
Example #16
from seagul.nn import MLP, fit_model
import torch
import torch.nn as nn
import time

net1 = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 32), nn.ReLU(),
                     nn.Linear(32, 1))

net2 = MLP(4, 1, num_layers=2, layer_size=32)
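# net1 and net2 share the same architecture (two 32-unit hidden layers, scalar
# output); the timings below compare fit_model on a plain nn.Sequential against
# the seagul MLP wrapper.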

X = torch.rand(40960, 4)
Y = torch.rand(40960, 1)

start = time.time()
fit_model(net1, X, Y, num_epochs=10)
print(time.time() - start)

start = time.time()
fit_model(net2, X, Y, num_epochs=10)
print(time.time() - start)
Example #17
for i, res in enumerate(pool.imap(do_rollout, zip(samples, range(int(num_trials/2), int(num_trials))))):
    rews, steps = res
    reward_hist[i, :, :] = rews
    total_steps += steps
    X[i+int(num_trials/2), :] = samples[i, :]
    Y[i+int(num_trials/2)] = sum(rews) > env.num_steps*3 - 5


print(time.time() - start)

# %%
from seagul.nn import MLP, fit_model
import torch

net = MLP(4, 1, 2, 32)  # output_activation=torch.nn.Softmax)
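# Fit a binary success classifier over the sampled states, re-weighting the
# positive class (w, class_weight below) as in the earlier supervised example.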
Y0 = np.ones((num_trials, 1), dtype=np.float32)

w = 1e-2
class_weight = torch.tensor(Y.shape[0]/sum(Y)*w, dtype=torch.float32)

loss_hist = fit_model(net, X, Y, 50, batch_size=2048, loss_fn=torch.nn.BCEWithLogitsLoss(pos_weight=class_weight))
#loss_hist = fit_model(net, X, Y, 50, batch_size=2048, loss_fn=torch.nn.BCEWithLogitsLoss())

# loss_hist = fit_model(net, X, Y, 100, batch_size=2048)
# loss_hist = fit_model(net, X, Y0, 5, batch_size=2048, loss_fn=torch.nn.BCEWithLogitsLoss(pos_weight=class_weight))

plt.close()
plt.plot(loss_hist)
plt.show()
Example #18
    else:
        print("Error: seed:", seed, "failed")
        print("Rewards were", rewards[-1])


start = time.time()
for seed in np.random.randint(0, 2 ** 32, 8):
    # init policy, value fn
    input_size = 4
    output_size = 1
    layer_size = 16
    num_layers = 1
    activation = nn.ReLU

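    # TD3 uses a deterministic policy plus twin Q-networks; unlike the SAC models
    # above it needs no separate state-value function.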
    model = TD3Model(
        policy=MLP(input_size, output_size, num_layers, layer_size, activation),
        q1_fn=MLP(input_size + output_size, 1, num_layers, layer_size, activation),
        q2_fn=MLP(input_size + output_size, 1, num_layers, layer_size, activation),
        act_limit=3
    )

    alg_config = {
        "env_name": env_name,
        "model": model,
        "seed": int(seed),  # int((time.time() % 1)*1e8),
        "train_steps" : 5e5,
        "exploration_steps": 500,
        "reward_stop": 1000,
        "gamma": .95,
        "act_std_schedule": (1, .1,),
        "replay_batch_size": 128,