    policy=policy,
    value_fn=value_fn,
    q1_fn=q1_fn,
    q2_fn=q2_fn,
    act_limit=5,
)

arg_dict = {
    "env_name": env_name,
    "model": model,
    "seed": seed,  # int((time.time() % 1)*1e8),
    "total_steps": 5e5,
    "exploration_steps": 10000,
    "min_steps_per_update": 200,
    "reward_stop": 1500,
    "gamma": 1,
}

run_sg(arg_dict, sac, None, 'back to 200 ah', "/data/data2/sac/")

# p = Process(
#     target=run_sg,
#     args=(arg_dict, ppo, None, "ppo2 drake acrobot with an act hold of 20, to see if Nans go away..", "/data2/ppo2_test/"),
# )
# p.start()
# proc_list.append(p)

# for p in proc_list:
#     print("joining")
#     p.join()
alg_config = {
    "env_name": env_name,
    "model": model,
    "seed": int(seed),  # int((time.time() % 1)*1e8),
    "train_steps": 1e6,
    "exploration_steps": 50000,
    "min_steps_per_update": 500,
    "reward_stop": 1000,
    "gamma": 1,
    "act_std_schedule": (.1,),
    "sgd_batch_size": 64,
    "replay_batch_size": 2048,
    "iters_per_update": 1000,
    "env_max_steps": 1000,
    "polyak": .995
    # "iters_per_update": float('inf'),
}

run_sg(alg_config, td3, "sac bullet defaults", "debug", "/data/" + "/" + "seed" + str(seed))

p = Process(target=run_and_test, args=[alg_config])
p.start()
proc_list.append(p)

for p in proc_list:
    p.join()

print(f"Total time: {(time.time() - start)}")
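# NOTE (assumption): run_and_test is not defined in this snippet; in the real
# script it presumably lives above the Process call. A minimal stand-in that
# just launches the training run through run_sg, mirroring the direct call
# above, might look like this sketch:
def run_and_test(config):
    # Hypothetical helper: train TD3 under this config. The real version
    # presumably also loads the saved policy afterwards and rolls it out
    # to test it (hence the name).
    run_sg(config, td3, "sac bullet defaults", "debug", "/data/" + "/" + "seed" + str(config["seed"]))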
    hold_count=1)

# model = PPOModel(policy=policy, value_fn=MLP(input_size, 1, num_layers, layer_size, activation), discrete=False)

arg_dict = {
    "env_name": env_name,
    "model": model,
    "act_var_schedule": [.1],
    "seed": seed,  # int((time.time() % 1)*1e8),
    "total_steps": 200 * 2048,
    "epoch_batch_size": 2048,
    "reward_stop": 900,
    "gamma": 1,
    "pol_epochs": 10,
    "val_epochs": 10,
}

run_sg(arg_dict, ppo, None, "let's see if we can learn to balance", "/data/data2/10_sat/")

# p = Process(
#     target=run_sg,
#     args=(arg_dict, ppo, None, "ppo2 drake acrobot with an act hold of 20, to see if Nans go away..", "/data2/ppo2_test/"),
# )
# p.start()
# proc_list.append(p)

# for p in proc_list:
#     print("joining")
#     p.join()
    nominal_policy=control,
    hold_count=20,
)

arg_dict = {
    "env_name": env_name,
    "model": model,
    "total_steps": 500 * 2048,
    "epoch_batch_size": 2048,
    "act_var_schedule": [2, 2],
    "gate_var_schedule": [0.1, 0.1],
    "gamma": 1,
    "seed": seed,
    "reward_stop": 1500,
}

run_name = "25_ppo2" + str(seed)

# import ipdb; ipdb.set_trace()
# run_sg(arg_dict, ppo_switch, run_name, 'reasonable torque limits, and a new but cheaty warm start', "/data/switch4/")

run_sg(
    arg_dict,
    ppo_switch,
    run_name,
    "trying to replicate earlier results that use ppo with ppo2",
    "/data/drake_ppo22/",
)

print("finished run ", run_name)
from seagul.nn import MLP
from seagul.rl.ppo.ppo2 import ppo
# PPOModel and run_sg are also used below; these import paths are assumed,
# they were not part of the original snippet:
from seagul.rl.models import PPOModel
from seagul.rl.run_utils import run_sg
import torch
import torch.nn as nn

## init policy, valuefn
input_size = 4
output_size = 1
layer_size = 64
num_layers = 3
activation = nn.ReLU

torch.set_default_dtype(torch.double)

model = PPOModel(
    policy=MLP(input_size, output_size, num_layers, layer_size, activation),
    value_fn=MLP(input_size, 1, num_layers, layer_size, activation),
    action_std=4,
)

arg_dict = {
    "env_name": "su_cartpole-v0",
    "model": model,
    "num_epochs": 10,
    "action_var_schedule": [10, 0],
}

run_sg(arg_dict, ppo)
"env_name": env_name, "model": model, "total_steps": 500 * 2048, "epoch_batch_size": 2048, "act_var_schedule": [2, 2], "gate_var_schedule": [0.1, 0.1], "gamma": 1, "seed": seed, "reward_stop": 1500, } run_name = "1000_ppo2" + str(seed) # import ipdb; ipdb.set_trace() run_sg(arg_dict, ppo_switch, run_name, 'trying to replicate earlier work that kinda of worked ', "/data/data1/switch4/") # p = Process( # target=run_sg, # args=( # arg_dict, # ppo_switch, # run_name, # "trying to replicate earlier results that use ppo with ppo2", # "/data/data2/drake_ppo2/", # ), # ) # p.start() # proc_list.append(p)
"m2": m2, "m1": m1, "l1": l1, "lc1": lc1, "lc2": lc2, "i1": I1, "i2": I2, "act_hold": 20, "gate_fn": torch.load("../switching2/warm/lqr_gate_better"), "controller": control } proc_list = [] for seed in np.random.randint(0, 2**32, 1): alg_config = { "env_name": "su_acroswitchsin-v0", "total_steps": 500000, "model": model, "seed": seed, "goal_state": np.array([0, 1, 1, 0, 0, 0]), "goal_lookback": 10, "goal_thresh": 1.5, "iters_per_update": float('inf'), "exploration_steps": 50000, "env_config": env_config } run_sg(alg_config, sac, "smoke_test" + str(seed), "", "/data_needle/" + trial_name)
    # While the flag stored in s[3] is still 0: entering the target region
    # pays +5.0 and sets the flag, any other step in this branch pays -1.0.
    elif s[3] == 0:
        if s[0] < -2 and s[2] < -3:
            reward = 5.0
            s[3] = 1
        else:
            reward = -1.0

    return reward, s


env_config = {"num_steps": 500, "reward_fn": reward_fn}

arg_dict = {
    "env_name": env_name,
    "model": model,
    "act_var_schedule": [1],
    "seed": seed,  # int((time.time() % 1)*1e8),
    "total_steps": 5e5,
    "epoch_batch_size": 2048,
    "gamma": 1,
    "pol_epochs": 10,
    "val_epochs": 10,
    "env_config": env_config,
}

run_name = "debug2" + str(seed)
run_sg(arg_dict, ppo, run_name, "basic smoke test", "/data/seagul/")
print("finished run ", run_name)
layer_size = 128
num_layers = 2
activation = nn.ReLU

proc_list = []

policy = MLP(input_size, output_size * 2, num_layers, layer_size, activation)  # Do I need to do weight sharing here?
value_fn = MLP(input_size, 1, num_layers, layer_size, activation)
q1_fn = MLP(input_size + output_size, 1, num_layers, layer_size, activation)
q2_fn = MLP(input_size + output_size, 1, num_layers, layer_size, activation)
model = SACModel(policy, value_fn, q1_fn, q2_fn, 1)

arg_dict = {
    'total_steps': 1e6,
    'model': model,
    'env_name': env_name,
    'seed': 2,
    'env_steps': 1000,
    'iters_per_update': 3000,
    'min_steps_per_update': 1000,
    'reward_stop': 3000,
    'exploration_steps': 10000,
    'replay_batch_size': 100,
    'use_gpu': False,
}

run_sg(arg_dict, sac, "/sac_walker0", "trying to get walker to work at all", "/sac_walker")
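# proc_list is created above but never used in this snippet. Below is a sketch
# (not from the original script) of how this config could be launched across
# several seeds in parallel, following the Process/proc_list pattern and the
# np.random.randint seed sweep used in the other run scripts here.
from multiprocessing import Process
import numpy as np

for sweep_seed in np.random.randint(0, 2**32, 4):
    arg_dict["seed"] = int(sweep_seed)
    p = Process(
        target=run_sg,
        args=(arg_dict, sac, "/sac_walker" + str(sweep_seed), "trying to get walker to work at all", "/sac_walker"),
    )
    p.start()
    proc_list.append(p)

for p in proc_list:
    p.join()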