Example #1
import numpy as np
import torch
import torch.nn as nn

from seagul.nn import MLP
from seagul.rl.models import PPOModel

input_size = 4
output_size = 1
layer_size = 0
num_layers = 0
activation = nn.ReLU


proc_list = []

for seed in range(200)[-7:]:  # the last seven seeds, 193-199
    # policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    policy = torch.load("warm/LQR_policy")
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
    model = PPOModel(
        policy=policy,
        value_fn=value_fn,
        discrete=False,
        # hold_count=0,
    )

    def reward_fn(ns, act):
        # Weighted quadratic penalty on deviation from the target state (pi, 0, 0, 0).
        return -1e-2 * ((ns[0] - np.pi) ** 2 + ns[1] ** 2 + 0.1 * ns[2] ** 2 + 0.2 * ns[3] ** 2)
        # return 1e-2 * (np.cos(ns[0]) + np.cos(ns[0] + ns[1]))
    
    env_config = {
        "max_torque" : 25,
        "init_state" : [np.pi, 0.0, 0.0, 0.0],
        "init_state_weights" : np.array([0, 0, 0, 0]),
        "dt" : .02,
        "max_t" : 1,
        "act_hold" : 1,
        "fixed_step" : True,
Example #2
import torch

from seagul.nn import MLP
from seagul.rl.models import PPOModel

# input_size, output_size, layer_size, num_layers, activation, and env_name
# are assumed to be defined earlier in the original script.

torch.set_default_dtype(torch.double)
proc_list = []

for seed in [0]:

    policy = MLP(input_size, output_size, num_layers, layer_size, activation)

    # model = PPOModelActHold(
    #     policy=policy,
    #     value_fn=MLP(input_size, 1, num_layers, layer_size, activation),
    #     discrete=False,
    #     hold_count = 200
    # )

    model = PPOModel(policy=policy,
                     value_fn=MLP(input_size, 1, num_layers, layer_size,
                                  activation),
                     discrete=False)

    arg_dict = {
        "env_name": env_name,
        "model": model,
        "action_var_schedule": [1, 1],
        "seed": seed,  # int((time.time() % 1)*1e8),
        "num_epochs": 1000,
        "epoch_batch_size": 2048,
        "gamma": 1,
        "p_epochs": 10,
        "v_epochs": 10,
    }

    run_name = "ppo" + str(seed)
Example #3
from seagul.old.sac.sac_ray import ray_sac
from seagul.nn import MLP
from seagul.rl.models import SACModel, PPOModel

input_size = 17
output_size = 6
layer_size = 64
num_layers = 2

policy = MLP(input_size, output_size * 2, num_layers, layer_size)
value_fn = MLP(input_size, 1, num_layers, layer_size)
q1_fn = MLP(input_size + output_size, 1, num_layers, layer_size)
q2_fn = MLP(input_size + output_size, 1, num_layers, layer_size)
model = SACModel(policy, value_fn, q1_fn, q2_fn, 3)

ppo_policy = MLP(input_size, output_size, num_layers, layer_size)
ppo_model = PPOModel(ppo_policy, value_fn)

env_name = "Walker2d-v2"
model, rews, var_dict = ray_sac(env_name,
                                100000,
                                model,
                                env_steps=1000,
                                iters_per_update=100,
                                min_steps_per_update=100,
                                reward_stop=1000,
                                exploration_steps=1000)
#model, rews, var_dict = ppo(env_name, 3e5, ppo_model)
globals().update(var_dict)
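
Since ray_sac returns the reward history alongside the trained model, a quick way to inspect training progress is to plot it. A minimal sketch with matplotlib, assuming rews is a flat sequence of per-episode (or per-update) returns:

import matplotlib.pyplot as plt

# Plot the reward sequence returned by ray_sac above.
plt.plot(rews)
plt.xlabel("episode")
plt.ylabel("return")
plt.title("Walker2d-v2 SAC training")
plt.show()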
Example #4
    return reward, done


env_config1 = {
    "init_state": [0, 0, 0, 0],
    "max_torque": max_torque,
    "init_state_weights": [0, 0, 0, 0],
    "dt": .01,
    "reward_fn": reward_fn,
    "max_t": max_t,
    "act_hold": 20
}

policy = MLP(input_size, output_size, num_layers, layer_size, activation)
value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
model1 = PPOModel(policy=policy, value_fn=value_fn, action_var=1)
env_config2 = {
    "init_state": [0, 0, 0, 0],
    "max_torque": max_torque,
    "init_state_weights": [0, 0, 0, 0],
    "dt": .01,
    "reward_fn": reward_fn,
    "max_t": max_t,
    "act_hold": 1
}

model2 = PPOModelActHold(policy=policy,
                         value_fn=value_fn,
                         action_var=1,
                         hold_count=20)
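
This example sets up the same 20-step action hold in two places: once through the environment's act_hold entry and once through PPOModelActHold's hold_count. Either way, the effect is that each chosen action is repeated for several low-level steps. A minimal sketch of that idea against a generic Gym environment, using a random action as a placeholder for the policy and assuming the old four-value step API:

import gym


def rollout_with_act_hold(env_name, hold_count, max_steps=200):
    # Repeat each sampled action for hold_count consecutive low-level steps,
    # accumulating reward over the held steps.
    env = gym.make(env_name)
    obs = env.reset()
    total_reward = 0.0

    for _ in range(max_steps // hold_count):
        act = env.action_space.sample()  # placeholder for policy(obs)
        for _ in range(hold_count):
            obs, rew, done, _ = env.step(act)
            total_reward += rew
            if done:
                return total_reward
    return total_reward


print(rollout_with_act_hold("Pendulum-v0", hold_count=20))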
Example #5
from seagul.rl.algos import ppo
from seagul.nn import MLP
from seagul.rl.models import PPOModel
import torch

torch.set_default_dtype(torch.double)  # TODO: need to update everything to support arbitrary dtypes

input_size = 3
output_size = 1
layer_size = 64
num_layers = 2

policy = MLP(input_size, output_size, num_layers, layer_size)
value_fn = MLP(input_size, 1, num_layers, layer_size)
model = PPOModel(policy, value_fn)

model, rews, var_dict = ppo("Pendulum-v0", 10000, model)
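After training, the policy network built above can be rolled out to eyeball its behavior. A minimal sketch, assuming ppo trains the policy in place, that the MLP maps an observation tensor directly to a 1-D action tensor (the action variance PPO adds on top is ignored here), and again using the old Gym step API:

import gym

env = gym.make("Pendulum-v0")
obs = env.reset()
total_reward = 0.0

for _ in range(200):
    # Deterministic rollout: use the policy output as the action.
    with torch.no_grad():
        act = policy(torch.as_tensor(obs)).numpy()
    obs, rew, done, _ = env.step(act)
    total_reward += rew
    if done:
        break

print("episode return:", total_reward)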
Example #6
for seed in [0, 1, 2, 3]:

    policy = MLP(input_size, output_size, num_layers, layer_size, activation)
    value_fn = MLP(input_size, 1, num_layers, layer_size, activation)

    # model = PPOModelActHold(
    #     policy=policy,
    #     value_fn=MLP(input_size, 1, num_layers, layer_size, activation),
    #     discrete=False,
    #     hold_count = 10
    # )

    model = PPOModel(policy=policy, value_fn=value_fn, discrete=False)

    def reward_fn(s):
        # Pays 5.0 for reaching alternating targets; s[3] tracks which target
        # is currently active and flips each time one is reached.
        if s[3] == 1:
            if s[0] > 2 and s[2] > 3:
                reward = 5.0
                s[3] = 0
            else:
                reward = 0.0

        elif s[3] == 0:
            if s[0] < -2 and s[2] < -3:
                reward = 5.0
                s[3] = 1
            else: