def main(args):
    # 1. Start a W&B run
    wandb.init(project='pearl', entity='adlr-ss-21-05')
    wandb.config.update(args)
    print("wandb name: ", wandb.run.name)

    log_dir = "tmp/" + wandb.run.name + "/"
    os.makedirs(log_dir, exist_ok=True)

    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, check_log=1,
                                                log_dir=log_dir, model_name=wandb.run.name)

    env = gym.make('kuka_iiwa_insertion-v0',
                   use_gui=False,
                   steps_per_action=args.steps_per_action,
                   max_steps=args.max_steps,
                   action_step_size=args.action_step_size)
    env = Monitor(env, log_dir)

    model = SAC("MlpPolicy", env,
                verbose=args.verbosity,
                train_freq=(args.train_freq_num, args.train_freq_type),
                batch_size=args.batch_size)

    i = 0
    save_interval = 1000000
    while True:
        i += save_interval
        model.learn(total_timesteps=save_interval, callback=callback)
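# NOTE: SaveOnBestTrainingRewardCallback is not defined in this snippet. Below is
# a minimal sketch following the standard Stable-Baselines3 custom-callback pattern
# from the SB3 docs; the check_log and model_name parameters (and how check_log is
# used) are assumptions inferred from the call site above.
import os

import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy


class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, check_log, log_dir, model_name, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.check_log = check_log  # assumed: multiplier on the 100-episode averaging window
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, model_name)
        self.best_mean_reward = -np.inf

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # Retrieve episode rewards logged by the Monitor wrapper
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(x) > 0:
                mean_reward = np.mean(y[-100 * self.check_log:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True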
def test_offpolicy_normalization(model_class, online_sampling):
    if online_sampling and model_class != HerReplayBuffer:
        pytest.skip()

    make_env_ = make_dict_env if model_class == HerReplayBuffer else make_env
    env = DummyVecEnv([make_env_])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)

    eval_env = DummyVecEnv([make_env_])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False,
                            clip_obs=10.0, clip_reward=10.0)

    if model_class == HerReplayBuffer:
        model = SAC(
            "MultiInputPolicy",
            env,
            verbose=1,
            learning_starts=100,
            policy_kwargs=dict(net_arch=[64]),
            replay_buffer_kwargs=dict(
                max_episode_length=100,
                online_sampling=online_sampling,
                n_sampled_goal=2,
            ),
            replay_buffer_class=HerReplayBuffer,
            seed=2,
        )
    else:
        model = model_class("MlpPolicy", env, verbose=1, learning_starts=100,
                            policy_kwargs=dict(net_arch=[64]))
    model.learn(total_timesteps=150, eval_env=eval_env, eval_freq=75)
    # Check getter
    assert isinstance(model.get_vec_normalize_env(), VecNormalize)
def test_goal_selection_strategy(goal_selection_strategy, online_sampling):
    """
    Test different goal selection strategies.
    """
    env = BitFlippingEnv(continuous=True)
    normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))
    model = SAC(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            goal_selection_strategy=goal_selection_strategy,
            online_sampling=online_sampling,
            max_episode_length=10,
            n_sampled_goal=2,
        ),
        train_freq=4,
        gradient_steps=1,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=100,
        buffer_size=int(1e5),
        action_noise=normal_action_noise,
    )
    assert model.action_noise is not None
    model.learn(total_timesteps=150)
def param_buff():
    res1 = [0, 0.5, 1, 1.5, 2]
    res2 = [1000, 2000, 10000, 100000, 10**6]
    res3 = [500, 500, 5000, 5000, 5000]
    for j, k in zip(res2, res3):
        for i in res1:
            model = SAC(
                "MlpPolicy",
                "Pendulum-v0",
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=k,
                verbose=1,
                create_eval_env=True,
                buffer_size=j,
                ent_coef=i,
                action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                # tensorboard_log="./sac_pendulum_tensorboard/",
            )
            eval_env = gym.make('Pendulum-v0')
            eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/',
                                         log_path='./logs/alpha2c', eval_freq=250,
                                         deterministic=True, render=False)
            model.learn(total_timesteps=20000, callback=eval_callback)
    return res1, res2
def main(do_render: bool, seed: int, as_gdads: bool, name: str, do_train: bool):
    drop_abs_position = True
    conf: Conf = CONFS[name]

    dict_env = get_env(name=name, drop_abs_position=drop_abs_position, is_training=True)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env)
    else:
        flat_env = flatten_env(dict_env, drop_abs_position)
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    dict_env = get_env(name=name, drop_abs_position=drop_abs_position, is_training=False)
    if as_gdads:
        use_slider = False
        if use_slider:
            eval_env = SliderWrapper(env=dict_env)
        else:
            eval_env = GDADSEvalWrapper(dict_env, sw=BestSkillProvider(flat_env))
    else:
        eval_env = flatten_env(dict_env=dict_env, drop_abs_position=drop_abs_position)

    filename = f"modelsCommandSkills/{name}/asGDADS{as_gdads}/resamplingFalse_goalSpaceTrue-seed-{seed}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename + ".zip", env=flat_env)
        print(f"loaded model {filename}")
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=filename, buffer_size=conf.buffer_size,
                  batch_size=conf.batch_size, gamma=gamma(conf.ep_len),
                  learning_starts=100 * conf.ep_len,
                  policy_kwargs=dict(log_std_init=-3, net_arch=[conf.layer_size] * 2),
                  seed=seed, device="cuda", train_freq=4)

    if do_train:
        train(model=sac, conf=conf, save_fname=filename, eval_env=eval_env)
    if do_render:
        show(model=sac, env=eval_env, conf=conf)

    do_eval = not do_train and not do_render
    if do_eval:
        results = ant_grid_evaluation(model=sac, env=eval_env, episode_len=conf.ep_len)
        dump_ant_grid_evaluation(results)
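# NOTE: the gamma() helper used above is a project function that is not shown in
# this snippet. A minimal sketch, assuming the common heuristic of tying the
# effective discount horizon to the episode length (a guess, not the confirmed
# implementation):
def gamma(ep_len: int) -> float:
    return 1.0 - 1.0 / ep_len  # e.g. ep_len = 100 -> gamma = 0.99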
def test_full_replay_buffer():
    """
    Test if HER works correctly with a full replay buffer when using online sampling.
    It should not sample the current episode, which is not finished.
    """
    n_bits = 4
    env = BitFlippingEnv(n_bits=n_bits, continuous=True)
    # Use a small buffer size to get the buffer full
    model = SAC(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=2,
            goal_selection_strategy="future",
            online_sampling=True,
            max_episode_length=n_bits,
        ),
        gradient_steps=1,
        train_freq=4,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=1,
        buffer_size=20,
        verbose=1,
        seed=757,
    )
    model.learn(total_timesteps=100)
def test_n_critics(n_critics):
    # Test SAC with different numbers of critics; for TD3, n_critics=1 corresponds to DDPG
    model = SAC("MlpPolicy", "Pendulum-v0",
                policy_kwargs=dict(net_arch=[64, 64], n_critics=n_critics),
                learning_starts=100, verbose=1)
    model.learn(total_timesteps=500)
def multiprocessing_with_off_policy_algorithms_example():
    # Multiprocessing with off-policy algorithms.
    env = make_vec_env("Pendulum-v1", n_envs=4, seed=0)

    # We collect 4 transitions per call to 'env.step()' and perform 2 gradient
    # steps per call to 'env.step()'. With gradient_steps=-1, we would do
    # 4 gradient steps per call to 'env.step()' instead.
    model = SAC("MlpPolicy", env, train_freq=1, gradient_steps=2, verbose=1)
    model.learn(total_timesteps=10_000)
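# A sketch of the gradient_steps=-1 variant mentioned in the comment above:
# SAC then performs one gradient step per collected transition, i.e. 4 per call
# to env.step() with n_envs=4. The function name is illustrative only.
def multiprocessing_matched_gradient_steps_example():
    env = make_vec_env("Pendulum-v1", n_envs=4, seed=0)
    model = SAC("MlpPolicy", env, train_freq=1, gradient_steps=-1, verbose=1)
    model.learn(total_timesteps=10_000)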
def main(trained_agent_type, zoom_level):
    # Mapping lunar lander controls to "W" (main engine), "A" (left engine), "D" (right engine)
    keys_to_action = {
        (ord('w'),): 2,
        (ord('a'),): 1,
        (ord('d'),): 3,
        (ord('d'), ord('w')): 3,
        (ord('a'), ord('w')): 1,
    }

    # Checking for the various trained_agent_type values that might be selected:
    # 0: The human has full control.
    # 1: Trained with Sensor human and intervention penalty of 1
    # 2: Trained with Noisy human and intervention penalty of 0.15
    # 3: Trained with Noisy human and intervention penalty of 0.75
    # 4: Ensemble of 1, 2, and 3, i.e. an action is sampled uniformly at random
    #    from one of those agents at each timestep
    if trained_agent_type == 0:
        # This agent doesn't actually do anything; it is just a placeholder to
        # satisfy HITLSBLunarLanderContEval's API
        hitl_agent = SAC.load('savedModels/sac_lunar_hitl_1p_sensor00.zip')
        eval_env = HITLSBLunarLanderContEval('LunarLanderContinuous-v2', hitl_agent,
                                             do_not_intervene=True)
        play(eval_env, zoom=zoom_level, fps=60, keys_to_action=keys_to_action,
             callback=print_rewards_callback)
    elif trained_agent_type == 4:
        hitl_agent1 = SAC.load('savedModels/sac_lunar_hitl_1p_sensor00.zip')
        hitl_agent2 = SAC.load('savedModels/sac_lunar_hitl_015p_noisy085.zip')
        hitl_agent3 = SAC.load('savedModels/sac_lunar_hitl_075p_noisy085.zip')
        eval_env = HITLSBLunarLanderContEval(
            'LunarLanderContinuous-v2', [hitl_agent1, hitl_agent2, hitl_agent3])
        play(eval_env, zoom=zoom_level, fps=60, keys_to_action=keys_to_action,
             callback=print_rewards_callback)
    else:
        if trained_agent_type == 1:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_1p_sensor00.zip'
        elif trained_agent_type == 2:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_015p_noisy085.zip'
        else:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_075p_noisy085.zip'

        # Load a saved human-in-the-loop agent for LunarLander
        hitl_agent = SAC.load(HITL_LUNAR_AGENT_PATH)
        # Create an instance of an evaluation environment, which takes in human
        # actions in its "step" function
        eval_env = HITLSBLunarLanderContEval('LunarLanderContinuous-v2', hitl_agent)
        play(eval_env, zoom=zoom_level, fps=60, keys_to_action=keys_to_action,
             callback=print_rewards_callback)
def test_sac(ent_coef):
    model = SAC('MlpPolicy', 'Pendulum-v0',
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=100,
                verbose=1,
                create_eval_env=True,
                ent_coef=ent_coef,
                action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)))
    model.learn(total_timesteps=1000, eval_freq=500)
def train(model: SAC, conf: Conf, save_fname: str, eval_env):
    model.env.reset()
    model.learn(total_timesteps=conf.ep_len * conf.num_episodes,
                log_interval=10,
                callback=[
                    eval_cb(env=eval_env, conf=conf, save_fname=save_fname),
                    LogDeltaStatistics(n_steps=conf.ep_len),
                    LogDeltasHistogram(env=model.env, freq_in_steps=25 * conf.ep_len),
                ],
                reset_num_timesteps=False)
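# NOTE: eval_cb() used above is not defined in this snippet. A minimal sketch,
# assuming it wraps Stable-Baselines3's EvalCallback and that Conf exposes
# ep_len; the evaluation frequency and episode count below are guesses:
from stable_baselines3.common.callbacks import EvalCallback

def eval_cb(env, conf, save_fname):
    return EvalCallback(env,
                        best_model_save_path=save_fname,
                        n_eval_episodes=5,
                        eval_freq=10 * conf.ep_len,
                        deterministic=True,
                        render=False)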
def __init__(self, env, hyperparameters=DEFAULT_HYPERPARAMETERS):
    self.P = hyperparameters
    if self.P["model_class"] == "dqn":
        from stable_baselines3 import DQN
        self.model = DQN('MlpPolicy', env, verbose=self.P["verbose"])
        self.model_class = DQN
    elif self.P["model_class"] == "a2c":
        from stable_baselines3 import A2C
        from stable_baselines3.a2c import MlpPolicy
        self.model = A2C(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = A2C
    elif self.P["model_class"] == "ddpg":
        from stable_baselines3 import DDPG
        from stable_baselines3.common.noise import NormalActionNoise
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        self.model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=self.P["verbose"])
        self.model_class = DDPG
    elif self.P["model_class"] == "td3":
        from stable_baselines3 import TD3
        from stable_baselines3.td3.policies import MlpPolicy
        from stable_baselines3.common.noise import NormalActionNoise
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        self.model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=self.P["verbose"])
        self.model_class = TD3
    elif self.P["model_class"] == "ppo":
        from stable_baselines3 import PPO
        from stable_baselines3.ppo import MlpPolicy
        self.model = PPO(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = PPO
    elif self.P["model_class"] == "sac":
        from stable_baselines3 import SAC
        from stable_baselines3.sac import MlpPolicy
        self.model = SAC(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = SAC
    else:
        raise NotImplementedError()
def test_train_freq_fail(train_freq):
    with pytest.raises(ValueError):
        model = SAC(
            "MlpPolicy",
            "Pendulum-v0",
            policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
            learning_starts=100,
            buffer_size=10000,
            verbose=1,
            train_freq=train_freq,
        )
        model.learn(total_timesteps=250)
def test_sac(ent_coef):
    model = SAC(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=100,
        verbose=1,
        create_eval_env=True,
        buffer_size=250,
        ent_coef=ent_coef,
        action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
    )
    model.learn(total_timesteps=300, eval_freq=250)
def main():
    """
    # Example with a simple DummyVecEnv:
    # env = gym.envs.make('panda-ip-reach-v0', renders=True)
    # env = DummyVecEnv([lambda: env])
    """
    print("Env created !")
    env = PandaReachGymEnv(renders=True)
    env.render(mode='rgb_array')

    model = SAC.load("sac_panda_reach")
    print("model loaded !")

    while True:
        obs, done = env.reset(), False
        print("===================================")
        print("obs")
        print(obs)
        episode_rew = 0
        # while not done:
        for i in range(50):
            env.render(mode='rgb_array')
            action, _states = model.predict(obs)
            obs, rew, done, info = env.step(action)
            episode_rew += rew
            if done:
                break
        print("Episode reward", episode_rew)
def test_save_load_pytorch_var(tmp_path):
    model = SAC("MlpPolicy", "Pendulum-v0", seed=3,
                policy_kwargs=dict(net_arch=[64], n_critics=1))
    model.learn(200)
    save_path = str(tmp_path / "sac_pendulum")
    model.save(save_path)
    env = model.get_env()
    ent_coef_before = model.log_ent_coef

    del model

    model = SAC.load(save_path, env=env)
    assert th.allclose(ent_coef_before, model.log_ent_coef)
    model.learn(200)
    ent_coef_after = model.log_ent_coef
    # Check that the entropy coefficient is still optimized
    assert not th.allclose(ent_coef_before, ent_coef_after)
def create_model(env, algorithm, save_path):
    # The noise object
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.2 * np.ones(n_actions),
                                                theta=0.15)
    if algorithm == "ddpg":
        return DDPG(DDPG_MlpPolicy, env,
                    learning_rate=0.001,
                    buffer_size=1000000,
                    batch_size=64,
                    tau=0.001,
                    gamma=0.99,
                    train_freq=(10, "step"),
                    action_noise=action_noise,
                    policy_kwargs=dict(optimizer_class=th.optim.AdamW),
                    tensorboard_log=save_path)
    elif algorithm == "td3":
        return TD3(TD3_MlpPolicy, env, action_noise=action_noise, tensorboard_log=save_path)
    elif algorithm == "sac":
        return SAC(SAC_MlpPolicy, env, action_noise=action_noise, tensorboard_log=save_path)
    else:
        raise Exception("--> Alican's LOG: Unknown agent type!")
def sac(env, hyper, policy="MlpPolicy", verbose=0, tensorboard_log=None,
        seed=0, use_sde=True, device="auto"):
    policy_kwargs = make_policy_kwargs(hyper, "sac")
    # Use the 'policy' argument (the original hardcoded 'MlpPolicy', leaving the
    # parameter unused); np.int is removed in recent NumPy, so plain int() is used.
    model = SAC(
        policy,
        env,
        verbose=verbose,
        tensorboard_log=tensorboard_log,
        seed=seed,
        use_sde=use_sde,
        learning_rate=hyper['params_lr'],
        gamma=hyper['params_gamma'],
        batch_size=int(hyper['params_batch_size']),
        buffer_size=int(hyper['params_buffer_size']),
        learning_starts=int(hyper['params_learning_starts']),
        train_freq=int(hyper['params_train_freq']),
        tau=hyper['params_tau'],
        gradient_steps=int(hyper['params_train_freq']),  # tuner assumes this
        policy_kwargs=policy_kwargs,
        device=device,
    )
    return model
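# NOTE: make_policy_kwargs() is project-specific and not shown here. A minimal
# sketch under the assumption that the tuned hyperparameters carry a
# hypothetical 'params_net_arch' entry such as "64,64":
def make_policy_kwargs(hyper, algo):
    net_arch = [int(w) for w in str(hyper.get("params_net_arch", "64,64")).split(",")]
    return dict(net_arch=net_arch)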
def get_perf(i):
    model = SAC("MlpPolicy", "Pendulum-v0",
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=5000,
                verbose=1,
                create_eval_env=True,
                buffer_size=1000000,
                ent_coef=0.2,
                action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                seed=42)
    saved_policy = MlpPolicy.load("Pendulum-v0#test4SAC#custom#None#{}.zip".format(i))
    mean_reward, std_reward = evaluate_policy(saved_policy, model.get_env(),
                                              n_eval_episodes=900)
    return mean_reward, std_reward
def __init__(self, algorithm: str, checkpoint_path: str):
    if algorithm == 'ppo':
        policy = PPO.load(checkpoint_path)
    elif algorithm == 'sac':
        policy = SAC.load(checkpoint_path)
    else:
        raise NotImplementedError
    self._model = policy
def test_sac(ent_coef, i):
    model = SAC(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=3000,
        verbose=1,
        create_eval_env=True,
        buffer_size=10000,
        ent_coef=ent_coef,
        action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
        # target_update_interval=5000,
        # tensorboard_log="./sac_pendulum_tensorboard/",
    )
    env = model.env
    eval_callback = EvalCallback(env, best_model_save_path='./logs/',
                                 log_path='./logs/without_target', eval_freq=250,
                                 deterministic=True, render=False)
    model.learn(total_timesteps=20000, eval_freq=250)
    """
    definition = 200
    portrait = np.zeros((definition, definition))
    state_min = env.observation_space.low
    state_max = env.observation_space.high
    for index_t, t in enumerate(np.linspace(-np.pi, np.pi, num=definition)):
        for index_td, td in enumerate(np.linspace(state_min[2], state_max[2], num=definition)):
            state = torch.Tensor([[np.cos(t), np.sin(t), td]])
            action = model.policy.forward(state)
            portrait[definition - (1 + index_td), index_t] = model.critic.q1_forward(state, action)
    plt.figure(figsize=(10, 10))
    plt.imshow(portrait, cmap="inferno",
               extent=[-180, 180, state_min[2], state_max[2]], aspect='auto')
    plt.rc('axes', titlesize=12)
    plt.xlabel('angle')
    plt.ylabel('velocity')
    plt.colorbar(label="critic value")
    plt.scatter([0], [0])
    plt.show()

    # policy = model.policy
    # policy.save("Pendulum-v0#test4SAC#custom#None#{}.zip".format(i))
    # saved_policy = MlpPolicy.load("Pendulum-v0#test4SAC#custom#None#{}.zip".format(i))
    # mean_reward, std_reward = evaluate_policy(saved_policy, model.get_env(), n_eval_episodes=10)
    # print(mean_reward, std_reward)
    """
    return model.replay_buffer.rewards
def command_demo(args, config):
    agent, callback = _init_agent(args, config, train=False)
    model = SAC.load(args.model_path)
    obs = agent.reset()
    for step in range(args.time_steps):
        if step % 100 == 0:
            print("step: ", step)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = agent.step(action)
def load_model(env, algorithm, filename):
    if algorithm == "ddpg":
        return DDPG.load(filename, env=env)
    elif algorithm == "td3":
        return TD3.load(filename, env=env)
    elif algorithm == "sac":
        return SAC.load(filename, env=env)
    else:
        raise Exception("--> Alican's LOG: Unknown agent type!")
def hindsight_experience_replay_example():
    # Hindsight Experience Replay (HER).
    import highway_env
    env = gym.make("parking-v0")

    # Create 4 artificial transitions per real transition.
    n_sampled_goal = 4

    # SAC hyperparams:
    model = SAC(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=n_sampled_goal,
            goal_selection_strategy="future",
            # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper,
            # we have to manually specify the max number of steps per episode.
            max_episode_length=100,
            online_sampling=True,
        ),
        verbose=1,
        buffer_size=int(1e6),
        learning_rate=1e-3,
        gamma=0.95,
        batch_size=256,
        policy_kwargs=dict(net_arch=[256, 256, 256]),
    )
    model.learn(int(2e5))
    model.save("her_sac_highway")

    # Load the saved model. Because it needs access to 'env.compute_reward()',
    # HER must be loaded with the env.
    model = SAC.load("her_sac_highway", env=env)
    obs = env.reset()

    # Evaluate the agent.
    episode_reward = 0
    for _ in range(100):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        episode_reward += reward
        if done or info.get("is_success", False):
            print("Reward:", episode_reward, "Success?", info.get("is_success", False))
            episode_reward = 0.0
            obs = env.reset()
def run(env, algname, filename):
    if algname == "TD3":
        model = TD3.load(f"{algname}_pkl")
    elif algname == "SAC":
        if filename:
            model = SAC.load(f"{filename}")
        else:
            model = SAC.load(f"{algname}_pkl")
    elif algname == "DDPG":
        model = DDPG.load(f"{algname}_pkl")
    else:
        # Raising a bare string is invalid in Python 3; raise a proper exception.
        raise ValueError("Wrong algorithm name provided.")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done:
            break
def _load_sac(agent, args, config, policy):
    # Both branches share the same hyperparameters, so build them once.
    sac_kwargs = dict(
        verbose=config.sac_verbose(),
        batch_size=config.sac_batch_size(),
        buffer_size=config.sac_buffer_size(),
        learning_starts=config.sac_learning_starts(),
        gradient_steps=config.sac_gradient_steps(),
        train_freq=config.sac_train_freq(),
        ent_coef=config.sac_ent_coef(),
        learning_rate=config.sac_learning_rate(),
        tensorboard_log="tblog",
        gamma=config.sac_gamma(),
        tau=config.sac_tau(),
        use_sde_at_warmup=config.sac_use_sde_at_warmup(),
        use_sde=config.sac_use_sde(),
        # The original load branch called config.sac_sample_freq(); assumed to be
        # a typo for config.sac_sde_sample_freq(), as in the construction branch.
        sde_sample_freq=config.sac_sde_sample_freq(),
        n_episodes_rollout=1,
    )
    if args.load_model == '':
        model = SAC("MlpPolicy", env=agent, policy_kwargs=policy, **sac_kwargs)
    else:
        model = SAC.load(args.load_model, env=agent, policy_kwargs=policy, **sac_kwargs)
    return model
def train_sac():
    latent_dim = 256
    vae = CVAE(latent_dim)
    vae.load_weights('./vae_256/checkpoint')

    env1 = DonkeyVAEEnv(vae, latent_dim, "Helios1")
    # manual_override=None if you don't want to "help" the agent with w, a, s, d:
    # env1 = DonkeyVAEEnv(vae, latent_dim, "Helios1", manual_override=ManualOverride())
    env1.client.collecting = False

    sac = SAC(env=env1, policy=MlpPolicy, buffer_size=20000, learning_starts=0,
              train_freq=20000, batch_size=256, verbose=2, gradient_steps=100,
              learning_rate=0.0005)
    # Uncomment if you want to load a model and retrain it:
    sac = sac.load("sac/model_sb3", env=env1)
    # sac = sac.load("sac/model_sb3_lake_36", env=env1)
    # sac = sac.load("sac/model_sb3_lake_36_unscaled", env=env1)

    env1.client.hardReset()
    env1.client.initCar()
    env1.client.reset()
    env1.client.restartScene()
    env1.client.hardReset()
    env1.client.initCar()
    env1.client.reset()
    env1.client.collecting = True
    env1.client.telemetrie = []

    while True:
        observation, index = env1.get_observation()
        action = sac.predict(np.asarray([observation]), deterministic=False)[0][0]
        steering, throttle = action[0], action[1]
        env1.client.send_controls(steering * 0.4, throttle)
        # env1.client.send_controls(steering * 0.7, throttle * 0.8)
        print(str(index) + " steering:" + str(action[0]) +
              " throttle:" + str(action[1]) +
              " speed:" + str(env1.client.telemetrie[index].speed))
def main():
    as_gdads = True
    name = "pointmass"
    drop_abs_position = True

    dads_env_fn = envs_fns[name]
    conf: Conf = CONFS[name]

    dict_env = as_dict_env(dads_env_fn())
    dict_env = TimeLimit(dict_env, max_episode_steps=conf.ep_len)
    if drop_abs_position:
        dict_env = DropGoalEnvsAbsoluteLocation(dict_env)

    if as_gdads:
        flat_env = SkillWrapper(env=dict_env, skill_reset_steps=conf.ep_len // 2)
    else:
        flat_obs_content = ["observation", "desired_goal", "achieved_goal"]
        if drop_abs_position:
            flat_obs_content.remove("achieved_goal")  # Because it is always the 0 vector
        flat_env = FlattenObservation(FilterObservation(dict_env, filter_keys=flat_obs_content))
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    filename = f"modelsCommandSkills/{name}-gdads{as_gdads}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename, env=flat_env)
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=f"{filename}-tb", buffer_size=10000)

    train(model=sac, conf=conf, save_fname=filename)
    if as_gdads:
        flat_env.save(filename)

    if as_gdads:
        flat_env.set_sac(sac)
        eval_dict_env(dict_env=dict_env, model=flat_env, ep_len=conf.ep_len)
    show(model=sac, env=flat_env, conf=conf)
def test_sac(ent_coef, i):
    model = SAC(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=5000,
        verbose=1,
        create_eval_env=True,
        buffer_size=1000000,
        ent_coef=ent_coef,
        action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
        # tensorboard_log="./sac_pendulum_tensorboard/",
    )
    eval_env = gym.make('Pendulum-v0')
    eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/',
                                 log_path='./logs/', eval_freq=250,
                                 deterministic=True, render=False)
    model.learn(total_timesteps=20000, callback=eval_callback)
def train_SAC(env, title="Stand Up Task Learning Curve"):
    print(f"action space shape -1: {env.action_space.shape[-1]}")
    # Action-space dimensionality (computed but unused here: action_noise=None
    # below, since SAC's stochastic policy provides its own exploration)
    n_actions = env.action_space.shape[-1]
    callback = Logger(log_dir=log_dir)
    timesteps = 20000
    model = SAC('MlpPolicy', env,
                learning_rate=0.001,
                learning_starts=10000,
                ent_coef='auto_1.1',
                train_freq=1,
                n_episodes_rollout=-1,
                target_entropy=-21,
                buffer_size=1000000,
                action_noise=None,
                batch_size=64,
                verbose=1,
                policy_kwargs=dict(net_arch=[64, 64]))
    model.learn(total_timesteps=timesteps, callback=callback)
    model.save("SAC_pkl")
    plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, title)
    plt.savefig("{}/learn_curve.png".format(log_dir))
    plt.show()