def test_save_load_pytorch_var(tmp_path):
    model = SAC("MlpPolicy", "Pendulum-v1", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1))
    model.learn(200)
    save_path = str(tmp_path / "sac_pendulum")
    model.save(save_path)
    env = model.get_env()
    log_ent_coef_before = model.log_ent_coef

    del model

    model = SAC.load(save_path, env=env)
    assert th.allclose(log_ent_coef_before, model.log_ent_coef)
    model.learn(200)
    log_ent_coef_after = model.log_ent_coef
    # Check that the entropy coefficient is still optimized
    assert not th.allclose(log_ent_coef_before, log_ent_coef_after)

    # With a fixed entropy coef
    model = SAC("MlpPolicy", "Pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1))
    model.learn(200)
    save_path = str(tmp_path / "sac_pendulum")
    model.save(save_path)
    env = model.get_env()
    assert model.log_ent_coef is None
    ent_coef_before = model.ent_coef_tensor

    del model

    model = SAC.load(save_path, env=env)
    assert th.allclose(ent_coef_before, model.ent_coef_tensor)
    model.learn(200)
    ent_coef_after = model.ent_coef_tensor
    assert model.log_ent_coef is None
    # Check that the entropy coefficient is still the same
    assert th.allclose(ent_coef_before, ent_coef_after)
def main(trained_agent_type, zoom_level):
    # mapping lunar lander controls to "W" (main engine), "A" (left engine), "D" (right engine)
    keys_to_action = {
        (ord('w'),): 2,
        (ord('a'),): 1,
        (ord('d'),): 3,
        (ord('d'), ord('w')): 3,
        (ord('a'), ord('w')): 1,
    }
    # Checking for the various trained_agent_type values that might be selected.
    # 0: The human has full control.
    # 1: Trained with Sensor human and intervention penalty of 1
    # 2: Trained with Noisy human and intervention penalty of 0.15
    # 3: Trained with Noisy human and intervention penalty of 0.75
    # 4: Ensemble of 1, 2, and 3, i.e. an action is sampled uniformly at random from one of those agents at each timestep
    if trained_agent_type == 0:
        # this agent doesn't actually do anything, it is just a placeholder to satisfy HITLSBLunarLanderContEval's API
        hitl_agent = SAC.load('savedModels/sac_lunar_hitl_1p_sensor00.zip')
        eval_env = HITLSBLunarLanderContEval('LunarLanderContinuous-v2', hitl_agent, do_not_intervene=True)
        play(eval_env, zoom=zoom_level, fps=60, keys_to_action=keys_to_action, callback=print_rewards_callback)
    elif trained_agent_type == 4:
        hitl_agent1 = SAC.load('savedModels/sac_lunar_hitl_1p_sensor00.zip')
        hitl_agent2 = SAC.load('savedModels/sac_lunar_hitl_015p_noisy085.zip')
        hitl_agent3 = SAC.load('savedModels/sac_lunar_hitl_075p_noisy085.zip')
        eval_env = HITLSBLunarLanderContEval(
            'LunarLanderContinuous-v2', [hitl_agent1, hitl_agent2, hitl_agent3])
        play(eval_env, zoom=zoom_level, fps=60, keys_to_action=keys_to_action, callback=print_rewards_callback)
    else:
        if trained_agent_type == 1:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_1p_sensor00.zip'
        elif trained_agent_type == 2:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_015p_noisy085.zip'
        else:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_075p_noisy085.zip'
        # load a saved human-in-the-loop agent for LunarLander
        hitl_agent = SAC.load(HITL_LUNAR_AGENT_PATH)
        # create an instance of an evaluation environment, which takes in human actions in its "step" function
        eval_env = HITLSBLunarLanderContEval('LunarLanderContinuous-v2', hitl_agent)
        play(eval_env, zoom=zoom_level, fps=60, keys_to_action=keys_to_action, callback=print_rewards_callback)
def main():
    """
    # Example with a simple Dummy vec env
    env = gym.envs.make('panda-ip-reach-v0', renders=True)
    env = DummyVecEnv([lambda: env])
    """
    print("Env created !")

    env = PandaReachGymEnv(renders=True)
    env.render(mode='rgb_array')

    model = SAC.load("sac_panda_reach")
    print("model loaded !")

    while True:
        obs, done = env.reset(), False
        print("===================================")
        print("obs")
        print(obs)

        episode_rew = 0
        # while not done:
        for i in range(50):
            env.render(mode='rgb_array')
            action, _states = model.predict(obs)
            obs, rew, done, info = env.step(action)
            episode_rew += rew
            if done:
                break
        print("Episode reward", episode_rew)
def main(do_render: bool, seed: int, as_gdads: bool, name: str, do_train: bool):
    drop_abs_position = True
    conf: Conf = CONFS[name]

    dict_env = get_env(name=name, drop_abs_position=drop_abs_position, is_training=True)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env)
    else:
        flat_env = flatten_env(dict_env, drop_abs_position)
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    dict_env = get_env(name=name, drop_abs_position=drop_abs_position, is_training=False)
    if as_gdads:
        use_slider = False
        if use_slider:
            eval_env = SliderWrapper(env=dict_env)
        else:
            eval_env = GDADSEvalWrapper(dict_env, sw=BestSkillProvider(flat_env))
    else:
        eval_env = flatten_env(dict_env=dict_env, drop_abs_position=drop_abs_position)

    filename = f"modelsCommandSkills/{name}/asGDADS{as_gdads}/resamplingFalse_goalSpaceTrue-seed-{seed}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename + ".zip", env=flat_env)
        print(f"loaded model {filename}")
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=filename, buffer_size=conf.buffer_size,
                  batch_size=conf.batch_size, gamma=gamma(conf.ep_len),
                  learning_starts=100 * conf.ep_len,
                  policy_kwargs=dict(log_std_init=-3, net_arch=[conf.layer_size] * 2),
                  seed=seed, device="cuda", train_freq=4)

    if do_train:
        train(model=sac, conf=conf, save_fname=filename, eval_env=eval_env)
    if do_render:
        show(model=sac, env=eval_env, conf=conf)

    do_eval = not do_train and not do_render
    if do_eval:
        results = ant_grid_evaluation(model=sac, env=eval_env, episode_len=conf.ep_len)
        dump_ant_grid_evaluation(results)
def command_demo(args, config):
    agent, callback = _init_agent(args, config, train=False)
    model = SAC.load(args.model_path)
    obs = agent.reset()
    for step in range(args.time_steps):
        if step % 100 == 0:
            print("step: ", step)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = agent.step(action)
def __init__(self, algorithm: str, checkpoint_path: str):
    if algorithm == 'ppo':
        policy = PPO.load(checkpoint_path)
    elif algorithm == 'sac':
        policy = SAC.load(checkpoint_path)
    else:
        raise NotImplementedError
    self._model = policy
def test_train_freq(tmp_path, train_freq):
    model = SAC(
        "MlpPolicy",
        "Pendulum-v1",
        policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
        learning_starts=100,
        buffer_size=10000,
        verbose=1,
        train_freq=train_freq,
    )
    model.learn(total_timesteps=150)
    model.save(tmp_path / "test_save.zip")
    env = model.get_env()
    model = SAC.load(tmp_path / "test_save.zip", env=env)
    model.learn(total_timesteps=150)
    model = SAC.load(tmp_path / "test_save.zip", train_freq=train_freq, env=env)
    model.learn(total_timesteps=150)
def load_model(env, algorithm, filename):
    if algorithm == "ddpg":
        return DDPG.load(filename, env=env)
    elif algorithm == "td3":
        return TD3.load(filename, env=env)
    elif algorithm == "sac":
        return SAC.load(filename, env=env)
    else:
        raise Exception("--> Alican's LOG: Unknown agent type!")
def run(env, algname, filename):
    if algname == "TD3":
        model = TD3.load(f"{algname}_pkl")
    elif algname == "SAC":
        if filename:
            model = SAC.load(f"{filename}")
        else:
            model = SAC.load(f"{algname}_pkl")
    elif algname == "DDPG":
        model = DDPG.load(f"{algname}_pkl")
    else:
        # raising a bare string is invalid in Python 3, so raise a proper exception instead
        raise ValueError("Wrong algorithm name provided.")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done:
            break
def hindsight_experience_replay_example():
    # Hindsight Experience Replay (HER).
    import highway_env

    env = gym.make("parking-v0")

    # Create 4 artificial transitions per real transition.
    n_sampled_goal = 4

    # SAC hyperparams:
    model = SAC(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=n_sampled_goal,
            goal_selection_strategy="future",
            # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper
            # we have to manually specify the max number of steps per episode.
            max_episode_length=100,
            online_sampling=True,
        ),
        verbose=1,
        buffer_size=int(1e6),
        learning_rate=1e-3,
        gamma=0.95,
        batch_size=256,
        policy_kwargs=dict(net_arch=[256, 256, 256]),
    )

    model.learn(int(2e5))
    model.save("her_sac_highway")

    # Load saved model.
    # Because it needs access to 'env.compute_reward()',
    # HER must be loaded with the env.
    model = SAC.load("her_sac_highway", env=env)

    obs = env.reset()

    # Evaluate the agent.
    episode_reward = 0
    for _ in range(100):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        episode_reward += reward
        if done or info.get("is_success", False):
            print("Reward:", episode_reward, "Success?", info.get("is_success", False))
            episode_reward = 0.0
            obs = env.reset()
def multiModelPredict(cycle, interactive, *argv):
    if interactive:
        plt.ion()

    figure, axes = plt.subplots(len(argv), 4, figsize=(12, 6))
    figure.tight_layout()

    menv = HedgingEnv()
    menv.reset()
    market = menv.market

    envs = []
    obsrv = []
    models = []
    actions = []
    for model_name in argv:
        env = HedgingEnv()
        env.model_name = model_name
        envs += [env]
        models += [SAC.load(model_name)]
        obsrv += [env.reset(market)]
        actions += [None]

    while True:
        done = False
        for i in range(len(models)):
            actions[i], _states = models[i].predict(obsrv[i])
            obsrv[i], _reward, done, _ = envs[i].step(actions[i])

        if done:
            menv = HedgingEnv()
            menv.reset()
            market = menv.market

            # plot
            for i in range(len(envs)):
                axs = axes[i]
                env = envs[i]
                dashboard_axes(env, axs[0], axs[1], axs[2], axs[3])
            plt.show()
            plt.pause(0.005)  # pause a bit so that plots are updated

            for i in range(len(envs)):
                obsrv[i] = envs[i].reset(market)

    if interactive:
        plt.ioff()
def show():
    '''
    shows the i-PADS in Streamlit
    :return:
    '''
    env = TwoDimEnv()
    model = SAC.load("longModel")

    st.title('Intelligent PADS')
    st.sidebar.write("d'où est parti le parachute?")
    rho = st.sidebar.slider('à quelle distance?', 0, 500, 0)
    theta = 2 * PI / 360 * st.sidebar.slider('avec quel angle?', 0, 360, 0)
    zed = st.sidebar.slider('à quelle hauteur?', 0, 100, 150)
    pitch = st.sidebar.slider('pitch', 0, 100, 50)
    location = st.sidebar.radio("Lieu", ['Fonsorbes', 'Paris', 'San Francisco'])
    lat_tg = LOC[location]['lat']
    lon_tg = LOC[location]['lon']

    df, df_path, df_col = run_episode(env, model, lat_tg, lon_tg, rho_init=rho, theta_init=theta, zed=zed)
    df_target = pd.DataFrame({'lat': [lat_tg], 'lon': [lon_tg]})

    deck_map = st.empty()
    initial_view_state = pdk.ViewState(latitude=lat_tg, longitude=lon_tg, zoom=12, pitch=pitch)
    deck_map.pydeck_chart(
        pdk.Deck(map_style='mapbox://styles/mapbox/light-v9',
                 initial_view_state=initial_view_state))

    df_pathi = df_path.copy()
    for i in range(zed):
        df_pathi['path'][0] = df_path['path'][0][0:i + 1]
        layers = get_layers(df[i:i + 1], df[0:i], df_target, df_pathi, df_col[0:i + 1])
        deck_map.pydeck_chart(
            pdk.Deck(map_style='mapbox://styles/mapbox/light-v9',
                     initial_view_state=initial_view_state,
                     layers=layers))
        time.sleep(TIMESLEEP)
def test_sac():
    log_dir = f"model_save/best_model_sac_cnn"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = SAC.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:", i, "action:", action, "now profit:", env.profit)
            if done:
                print('stock', i, ' total profit=', env.profit, ' buy hold=', env.buy_hold)
                break
def play():
    env = gym.make('kuka_iiwa_insertion-v0', use_gui=True)
    model = SAC.load("models/kuka_iiwa_insertion-v0_sac_best_model", env=env)

    obs = env.reset()
    i = 0
    episode_reward = 0.0
    while True:
        i += 1
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        episode_reward += rewards
        if i % 10 == 0 or dones:
            print(obs, episode_reward, rewards, info)
        if dones:
            print("=" * 20 + " RESET " + "=" * 20)
            episode_reward = 0
            env.reset()
def show_print():
    '''
    for debug purposes, allows checking the content of the df passed to the layers
    :return:
    '''
    env = TwoDimEnv()
    model = SAC.load("longModel")
    rho, theta, zed = 100, 0, 10
    lat_tg, lon_tg = LOC['Fonsorbes']['lat'], LOC['Fonsorbes']['lon']
    df, df_path = run_episode(env, model, lat_tg, lon_tg, theta_init=theta, zed=zed)
    df_pathi = df_path.copy()
    for i in range(zed):
        df_temp = pd.DataFrame([{'path': df_path['path'][0][0:i]}])
        df_pathi.update(df_temp)
        print(df_pathi)
def train_sac():
    latent_dim = 256
    vae = CVAE(latent_dim)
    vae.load_weights('./vae_256/checkpoint')

    env1 = DonkeyVAEEnv(vae, latent_dim, "Helios1")
    # manual_override=None if you don't want to "help" the Agent with w, a, s, d
    # env1 = DonkeyVAEEnv(vae, latent_dim, "Helios1", manual_override=ManualOverride())
    env1.client.collecting = False

    sac = SAC(env=env1, policy=MlpPolicy, buffer_size=20000, learning_starts=0,
              train_freq=20000, batch_size=256, verbose=2, gradient_steps=100,
              learning_rate=0.0005)
    # uncomment if you want to load a model and retrain it
    sac = sac.load("sac/model_sb3", env=env1)
    # sac = sac.load("sac/model_sb3_lake_36", env=env1)
    # sac = sac.load("sac/model_sb3_lake_36_unscaled", env=env1)

    env1.client.hardReset()
    env1.client.initCar()
    env1.client.reset()
    env1.client.restartScene()
    env1.client.hardReset()
    env1.client.initCar()
    env1.client.reset()
    env1.client.collecting = True
    env1.client.telemetrie = []

    while True:
        observation, index = env1.get_observation()
        action = sac.predict(np.asarray([observation]), deterministic=False)[0][0]
        steering, throttle = action[0], action[1]
        env1.client.send_controls(steering * 0.4, throttle)
        # env1.client.send_controls(steering * 0.7, throttle * 0.8)
        print(
            str(index) + " steering:" + str(action[0]) + " throttle:" + str(action[1]) +
            " speed:" + str(env1.client.telemetrie[index].speed))
def _load_sac(agent, args, config, policy):
    model = None
    if args.load_model == '':
        model = SAC("MlpPolicy",
                    policy_kwargs=policy,
                    env=agent,
                    verbose=config.sac_verbose(),
                    batch_size=config.sac_batch_size(),
                    buffer_size=config.sac_buffer_size(),
                    learning_starts=config.sac_learning_starts(),
                    gradient_steps=config.sac_gradient_steps(),
                    train_freq=config.sac_train_freq(),
                    ent_coef=config.sac_ent_coef(),
                    learning_rate=config.sac_learning_rate(),
                    tensorboard_log="tblog",
                    gamma=config.sac_gamma(),
                    tau=config.sac_tau(),
                    use_sde_at_warmup=config.sac_use_sde_at_warmup(),
                    use_sde=config.sac_use_sde(),
                    sde_sample_freq=config.sac_sde_sample_freq(),
                    n_episodes_rollout=1)
    else:
        model = SAC.load(args.load_model,
                         env=agent,
                         policy_kwargs=policy,
                         verbose=config.sac_verbose(),
                         batch_size=config.sac_batch_size(),
                         buffer_size=config.sac_buffer_size(),
                         learning_starts=config.sac_learning_starts(),
                         gradient_steps=config.sac_gradient_steps(),
                         train_freq=config.sac_train_freq(),
                         ent_coef=config.sac_ent_coef(),
                         learning_rate=config.sac_learning_rate(),
                         tensorboard_log="tblog",
                         gamma=config.sac_gamma(),
                         tau=config.sac_tau(),
                         use_sde_at_warmup=config.sac_use_sde_at_warmup(),
                         use_sde=config.sac_use_sde(),
                         sde_sample_freq=config.sac_sample_freq(),
                         n_episodes_rollout=1)
    return model
def main():
    as_gdads = True
    name = "pointmass"
    drop_abs_position = True

    dads_env_fn = envs_fns[name]
    conf: Conf = CONFS[name]

    dict_env = as_dict_env(dads_env_fn())
    dict_env = TimeLimit(dict_env, max_episode_steps=conf.ep_len)
    if drop_abs_position:
        dict_env = DropGoalEnvsAbsoluteLocation(dict_env)

    if as_gdads:
        flat_env = SkillWrapper(env=dict_env, skill_reset_steps=conf.ep_len // 2)
    else:
        flat_obs_content = ["observation", "desired_goal", "achieved_goal"]
        if drop_abs_position:
            flat_obs_content.remove("achieved_goal")  # Because always 0 vector
        flat_env = FlattenObservation(FilterObservation(dict_env, filter_keys=flat_obs_content))
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    filename = f"modelsCommandSkills/{name}-gdads{as_gdads}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename, env=flat_env)
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=f"{filename}-tb", buffer_size=10000)
        train(model=sac, conf=conf, save_fname=filename)
        if as_gdads:
            flat_env.save(filename)

    if as_gdads:
        flat_env.set_sac(sac)
        eval_dict_env(dict_env=dict_env, model=flat_env, ep_len=conf.ep_len)
    show(model=sac, env=flat_env, conf=conf)
def singleModelPredict(model_name, cycle=5, interactive=True):
    if interactive:
        plt.ion()

    env = HedgingEnv()
    # env.mu = -0.5
    model = SAC.load(model_name)
    obs = env.reset()
    cnt = 0
    while True:
        reward_history = []
        action, _states = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_history += [reward]
        if done:
            env.render()
            obs = env.reset()
            cnt += 1
            if cnt > cycle:
                break

    if interactive:
        plt.ioff()
parser.add_argument('-m', '--run_mode', default='train')
parser.add_argument('-s', '--simulator', default='mujoco')
args = parser.parse_args()

version = args.version
task_name = args.task_name
run_mode = args.run_mode
simulator = args.simulator

if args.load_version is None:
    best_model_save_path = './{}/{}/SAC-v{}/logs/best_model.zip'.format(simulator, task_name, version)
else:
    best_model_save_path = './{}/{}/SAC-v{}/logs/best_model.zip'.format(simulator, task_name, args.load_version)

env = build_env(task_name, version, run_mode, simulator, visual=True, ctrl_delay=True)
model = SAC.load(best_model_save_path, device=torch.device('cuda:0'))

obs = env.reset()
total_reward = 0
for i in range(10000):
    action, _states = model.predict(obs, deterministic=True)
    # action = np.array([-10, 30, -75,
    #                    10, 30, -75,
    #                    -10, 50, -75,
    #                    10, 50, -75]) * np.pi / 180
    # action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    print(info['energy'])
    total_reward += reward
from stable_baselines3 import PPO, SAC
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

from common import common
import common.gym_interface as gym_interface

if __name__ == "__main__":
    hyperparams = common.load_hyperparameters(conf_name="SAC")

    venv = DummyVecEnv([gym_interface.make_env(robot_body=300)])

    keys_remove = ["normalize", "n_envs", "n_timesteps", "policy"]
    for key in keys_remove:
        if key in hyperparams:
            del hyperparams[key]

    model = SAC('MlpPolicy', venv, verbose=1, seed=common.seed, **hyperparams)
    model.save("output_data/tmp/tmp")
    model = SAC.load("output_data/tmp/tmp.zip")
    model = SAC.load("output_data/models/best_model.zip")
# Step 3.b. Make a Vectorized Environment to be able to use Normalize or FrameStack (Optional)
env = make_vec_env(lambda: env, n_envs=1)
# Step 3.b Passing through Normalization and stack frame (Optional)
env = VecFrameStack(env, n_stack=custom_params['FRAME_STACK'])  # Use 1 for now because we use image
if not custom_params['USING_VAE']:
    env = VecTransposeImage(env)  # Uncomment if using 3d obs
if custom_params['USING_NORMALIZATION']:
    env = VecNormalize.load(osp.join(results_dir, "vec_normalization.pkl"), env)

# Load the agent
if custom_params['algo'] == 'sac':
    model = SAC.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'a2c':
    model = A2C.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'dqn':
    model = DQN.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'ppo':
    model = PPO.load(osp.join(results_dir, "best_model", "best_model.zip"))
else:
    raise ValueError("Error model")

# Load the saved statistics;
# do not update them at test time
env.training = False
# reward normalization is not needed at test time
env.norm_reward = False
    gcloud.read_from_bucket(client, bucket_name, model_path)
    model_path = './' + model_path
else:
    model_path = args.model

model = None
if args.algorithm == 'DQN':
    model = DQN.load(model_path, tensorboard_log=args.tensorboard)
elif args.algorithm == 'DDPG':
    model = DDPG.load(model_path, tensorboard_log=args.tensorboard)
elif args.algorithm == 'A2C':
    model = A2C.load(model_path, tensorboard_log=args.tensorboard)
elif args.algorithm == 'PPO':
    model = PPO.load(model_path, tensorboard_log=args.tensorboard)
elif args.algorithm == 'SAC':
    model = SAC.load(model_path, tensorboard_log=args.tensorboard)
elif args.algorithm == 'TD3':
    model = TD3.load(model_path, tensorboard_log=args.tensorboard)
else:
    raise RuntimeError('Algorithm specified is not registered.')

model.set_env(env)

# ---------------------------------------------------------------------------- #
#     Calculating total training timesteps based on number of episodes         #
# ---------------------------------------------------------------------------- #
n_timesteps_episode = env.simulator._eplus_one_epi_len / \
    env.simulator._eplus_run_stepsize
timesteps = args.episodes * n_timesteps_episode - 1
# ---------------------------------------------------------------------------- #
from stable_baselines3 import SAC
import pybullet_envs
import gym
from stable_baselines3.common.callbacks import EvalCallback

ENV_NAME = 'HalfCheetahBulletEnv-v0'
TIME_STEPS = 100000

env = gym.make(ENV_NAME)
eval_env = gym.make(ENV_NAME)

model = SAC.load("logs/best_model")

max_v = 0
min_v = 0
total_reward = 0

env.render()
obs = env.reset()
for i in range(100000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    total_reward += reward
    env.render()
    if done:
        obs = env.reset()
        print('Test reward is {:.3f}.'.format(total_reward))
        total_reward = 0

env.close()
                simulator, visual=False, ctrl_delay=True)
eval_callback = EvalCallback(eval_env, best_model_save_path=best_model_save_path,
                             log_path=log_path, eval_freq=5000,
                             deterministic=True, render=False)
policy_kwargs = dict(activation_fn=torch.nn.ReLU, net_arch=net_arch)

if args.load_version is not None:
    best_model_dir = './{}/{}/SAC-v{}/logs/best_model.zip'.format(
        simulator, task_name, args.load_version)
    model = SAC.load(best_model_dir, device=torch.device('cuda:0'))
    model.set_env(env)
    model.tensorboard_log = tensorboard_log
    model.num_timesteps = 0
    model.learning_starts = args.learning_starts
    model.buffer_size = args.buffer_size
    model.learning_rate = learning_rate
    if ent_coef == 'auto':
        init_value = 1.0
        model.log_ent_coef = torch.log(
            torch.ones(1, device=model.device) * init_value).requires_grad_(True)
        model.ent_coef_optimizer = torch.optim.Adam(
            [model.log_ent_coef], lr=model.lr_schedule(1))
    else:
def advanced_saving_and_loading_example():
    # Advanced Saving and Loading.
    from stable_baselines3.sac.policies import MlpPolicy

    # Create the model, the training environment and the test environment (for evaluation).
    model = SAC('MlpPolicy', 'Pendulum-v1', verbose=1, learning_rate=1e-3, create_eval_env=True)

    # Evaluate the model every 1000 steps on 5 test episodes and save the evaluation to the "logs/" folder.
    model.learn(6000, eval_freq=1000, n_eval_episodes=5, eval_log_path="./logs/")

    # Save the model.
    model.save("sac_pendulum")

    # The saved model does not contain the replay buffer.
    loaded_model = SAC.load("sac_pendulum")
    print(f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer")

    # Now save the replay buffer too.
    model.save_replay_buffer("sac_replay_buffer")

    # Load it into the loaded_model.
    loaded_model.load_replay_buffer("sac_replay_buffer")

    # Now the loaded replay buffer is not empty anymore.
    print(f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer")

    # Save the policy independently from the model.
    # Note: if you don't save the complete model with 'model.save()'
    # you cannot continue training afterward.
    policy = model.policy
    policy.save("sac_policy_pendulum")

    # Retrieve the environment.
    env = model.get_env()

    # Evaluate the policy.
    mean_reward, std_reward = evaluate_policy(policy, env, n_eval_episodes=10, deterministic=True)
    print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

    # Load the policy independently from the model.
    saved_policy = MlpPolicy.load("sac_policy_pendulum")

    # Evaluate the loaded policy.
    mean_reward, std_reward = evaluate_policy(saved_policy, env, n_eval_episodes=10, deterministic=True)
    print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
for i in range(start_loop, end_loop):
    print("EVAL ", i)
    avg_dis_reward_run = []
    for j in range(0, 10):
        print("SEED 0")
        # lambd = np.load(f"./{args.folder}/buffers/lambda_{args.algo}_{j}.npy")
        # N = np.load(f"./{args.folder}/buffers/N_{args.algo}_{j}.npy")
        model_name = f"./{args.folder}/models/model_{args.algo}_{j}_{i}"
        # print("Lambd N i ", lambd[i], N[i])
        env.set_N(int(N[i]), list(lambd[i]))

        if args.algo == 0:
            model = PPO.load(model_name, env)
        elif args.algo == 1:
            model = A2C.load(model_name, env)
        elif args.algo == 2:
            model = SAC.load(model_name, env)
        elif args.algo == 3:
            thres_vec = np.load(
                f"./{args.folder}/buffers/thresvec_{args.env_name}_{j}.npy"
            )
            model.set_threshold_vec(thres_vec[i])

        avg_dis_reward = 0.0
        for k in range(100):
            env.seed(k)
            obs = env.reset()
            reward_traj = []
            dis_reward = 0.0
            for t in range(int(1e3)):
                if args.algo == 3:
                    action = model.select_action(np.array(obs), eval_=True)
                else:
def eval_model(env, model_name):
    env.reset()
    env.reset_dymola()

    mode = 'load'
    if mode == 'load':
        model = SAC.load(model_name, env=env)
    else:
        model = SAC(MlpPolicy, env, learning_rate=10**-4, verbose=1, tensorboard_log='tensorboard_log')
        tic = time.time()
        env.reset()
        model.learn(10000, reset_num_timesteps=False)
        model.save("IEEE9_5k_v4")
        toc = time.time()
        print(toc - tic)

    # Roll out the RL agent
    obs = env.reset()
    actions = []
    rewards = []
    for _ in range(250):
        action = model.predict(obs)[0]
        actions += action.tolist()
        obs, reward, done, info = env.step(action)
        rewards += [reward]

    volt_norm = []
    legend = []
    fig, ax = plt.subplots(5, 3, figsize=(40, 30))
    for i in range(3):
        for j in range(3):
            bus = 1 + 3 * i + j
            bus_volt = np.array(env.debug_data[f'iEEE_14_Buses.B{bus}.V']) - 1.0
            volt_norm += [bus_volt]
            ax[i][j].plot(env.debug_data['iEEE_14_Buses.my_time'], bus_volt, color='r')
            ax[i][j].set_ylabel('Voltage Dev')
            ax[i][j].set_title(f'Bus {bus}')
    legend += ['RL Agent']

    gen = ['G2.gENROU.P', 'G1.gENSAL.P']
    for j in range(2):
        ax[3][j].plot(env.debug_data['iEEE_14_Buses.my_time'], env.debug_data[f'iEEE_14_Buses.{gen[j]}'], color='r')
        ax[3][j].set_xlabel('Time (sec)')
        ax[3][j].set_title(f'{gen[j]} Output')
        ax[4][j].plot(np.arange(250), actions[j::2], color='r')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'],
                  np.divide(np.cumsum(np.linalg.norm(volt_norm, axis=0)),
                            np.clip(env.debug_data['iEEE_14_Buses.my_time'], 1, np.inf)), color='r')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'], np.linalg.norm(volt_norm, axis=0), color='r')
    ax[4][2].plot(np.arange(250), rewards, color='r')

    # Baseline 1: random actions
    env.reset()
    actions = []
    rewards = []
    for _ in range(250):
        action = env.action_space.sample()
        actions += action.tolist()
        obs, reward, done, info = env.step(action)
        rewards += [reward]

    volt_norm = []
    for i in range(3):
        for j in range(3):
            bus = 1 + 3 * i + j
            bus_volt = np.array(env.debug_data[f'iEEE_14_Buses.B{bus}.V']) - 1.0
            volt_norm += [bus_volt]
            ax[i][j].plot(env.debug_data['iEEE_14_Buses.my_time'], bus_volt, color='b')
    legend += ['Randomized']
    for j in range(2):
        ax[3][j].plot(env.debug_data['iEEE_14_Buses.my_time'], env.debug_data[f'iEEE_14_Buses.{gen[j]}'], color='b')
        ax[4][j].plot(np.arange(250), actions[j::2], color='b')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'], np.linalg.norm(volt_norm, axis=0), color='b')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'],
                  np.divide(np.cumsum(np.linalg.norm(volt_norm, axis=0)),
                            np.clip(env.debug_data['iEEE_14_Buses.my_time'], 1, np.inf)), color='b')
    ax[4][2].plot(np.arange(250), rewards, color='b')

    # Baseline 2: do nothing
    env.reset()
    actions = []
    rewards = []
    for _ in range(250):
        action = env.default_action  # null action
        actions += action
        obs, reward, done, info = env.step(action)
        rewards += [reward]

    volt_norm = []
    for i in range(3):
        for j in range(3):
            bus = 1 + 3 * i + j
            bus_volt = np.array(env.debug_data[f'iEEE_14_Buses.B{bus}.V']) - 1.0
            volt_norm += [bus_volt]
            ax[i][j].plot(env.debug_data['iEEE_14_Buses.my_time'], bus_volt, color='g')
    legend += ['Do Nothing']
    for j in range(2):
        ax[3][j].plot(env.debug_data['iEEE_14_Buses.my_time'], env.debug_data[f'iEEE_14_Buses.{gen[j]}'], color='g')
        ax[4][j].plot(np.arange(250), actions[j::2], color='g')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'],
                  np.divide(np.cumsum(np.linalg.norm(volt_norm, axis=0)),
                            np.clip(env.debug_data['iEEE_14_Buses.my_time'], 1, np.inf)), color='g')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'], np.linalg.norm(volt_norm, axis=0), color='g')
    ax[0][2].legend(legend)
    ax[4][2].plot(np.arange(250), rewards, color='g')

    env.dymola.close()
    plt.savefig(model_name)
    plt.show()
    return
#### Load the model from file ##############################
algo = ARGS.exp.split("-")[2]

if os.path.isfile(ARGS.exp + '/success_model.zip'):
    path = ARGS.exp + '/success_model.zip'
elif os.path.isfile(ARGS.exp + '/best_model.zip'):
    path = ARGS.exp + '/best_model.zip'
else:
    print("[ERROR]: no model under the specified path", ARGS.exp)

if algo == 'a2c':
    model = A2C.load(path)
if algo == 'ppo':
    model = PPO.load(path)
if algo == 'sac':
    model = SAC.load(path)
if algo == 'td3':
    model = TD3.load(path)
if algo == 'ddpg':
    model = DDPG.load(path)

#### Parameters to recreate the environment ################
env_name = ARGS.exp.split("-")[1] + "-aviary-v0"
OBS = ObservationType.KIN if ARGS.exp.split("-")[3] == 'kin' else ObservationType.RGB

if ARGS.exp.split("-")[4] == 'rpm':
    ACT = ActionType.RPM
elif ARGS.exp.split("-")[4] == 'dyn':
    ACT = ActionType.DYN
elif ARGS.exp.split("-")[4] == 'pid':
    ACT = ActionType.PID
import os

from funcy import last
from glob import glob
from self_driving import SelfDriving
from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy

env = SelfDriving()
model_path = last(sorted(glob('log/*.zip'), key=lambda f: os.stat(f).st_mtime))
model = SAC.load(model_path, env)

print(model_path)

reward_mean, _ = evaluate_policy(model, env, n_eval_episodes=1, render=True, warn=False)
print(f'reward: {reward_mean:.02f}')

for _ in range(10):
    env.seed(None)  # Set the random seed to None (i.e. use the current time).
    observation = env.reset()
    done = False

    while not done:
        action, _ = model.predict(observation, deterministic=True)