Code example #1
def test_save_load_pytorch_var(tmp_path):
    model = SAC("MlpPolicy", "Pendulum-v1", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1))
    model.learn(200)
    save_path = str(tmp_path / "sac_pendulum")
    model.save(save_path)
    env = model.get_env()
    log_ent_coef_before = model.log_ent_coef

    del model

    model = SAC.load(save_path, env=env)
    assert th.allclose(log_ent_coef_before, model.log_ent_coef)
    model.learn(200)
    log_ent_coef_after = model.log_ent_coef
    # Check that the entropy coefficient is still optimized
    assert not th.allclose(log_ent_coef_before, log_ent_coef_after)

    # With a fixed entropy coef
    model = SAC("MlpPolicy", "Pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1))
    model.learn(200)
    save_path = str(tmp_path / "sac_pendulum")
    model.save(save_path)
    env = model.get_env()
    assert model.log_ent_coef is None
    ent_coef_before = model.ent_coef_tensor

    del model

    model = SAC.load(save_path, env=env)
    assert th.allclose(ent_coef_before, model.ent_coef_tensor)
    model.learn(200)
    ent_coef_after = model.ent_coef_tensor
    assert model.log_ent_coef is None
    # Check that the entropy coefficient is still the same
    assert th.allclose(ent_coef_before, ent_coef_after)
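The two attributes exercised above are also the handles for reading the entropy coefficient off a trained model. A minimal sketch, assuming the same Stable-Baselines3 SAC API (the attribute names come straight from the test):

import torch as th
from stable_baselines3 import SAC

# ent_coef="auto" (the default): the coefficient is learned and stored as log_ent_coef
auto_model = SAC("MlpPolicy", "Pendulum-v1", ent_coef="auto")
print("learned alpha:", th.exp(auto_model.log_ent_coef).item())

# fixed ent_coef: log_ent_coef stays None, the constant lives in ent_coef_tensor
fixed_model = SAC("MlpPolicy", "Pendulum-v1", ent_coef=0.01)
print("fixed alpha:", fixed_model.ent_coef_tensor.item())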
Code example #2
def main(trained_agent_type, zoom_level):
    # mapping lunar lander controls to "W" (main engine), "A" (left engine), "D" (right engine)
    keys_to_action = {
        (ord('w'), ): 2,
        (ord('a'), ): 1,
        (ord('d'), ): 3,
        (ord('d'), ord('w')): 3,
        (ord('a'), ord('w')): 1,
    }

    # Check which trained_agent_type was selected.
    # 0: The human has full control.
    # 1: Trained with Sensor human and intervention penalty of 1
    # 2: Trained with Noisy human and intervention penalty of 0.15
    # 3: Trained with Noisy human and intervention penalty of 0.75
    # 4: Ensemble of 1, 2, and 3, i.e. an action is sampled uniformly at random from one of those agents at each timestep
    if trained_agent_type == 0:
        # this agent doesn't actually do anything, just a placeholder to satisfy HITLSBLunarLanderContEval's API
        hitl_agent = SAC.load('savedModels/sac_lunar_hitl_1p_sensor00.zip')
        eval_env = HITLSBLunarLanderContEval('LunarLanderContinuous-v2',
                                             hitl_agent,
                                             do_not_intervene=True)
        play(eval_env,
             zoom=zoom_level,
             fps=60,
             keys_to_action=keys_to_action,
             callback=print_rewards_callback)
    elif trained_agent_type == 4:
        hitl_agent1 = SAC.load('savedModels/sac_lunar_hitl_1p_sensor00.zip')
        hitl_agent2 = SAC.load('savedModels/sac_lunar_hitl_015p_noisy085.zip')
        hitl_agent3 = SAC.load('savedModels/sac_lunar_hitl_075p_noisy085.zip')
        eval_env = HITLSBLunarLanderContEval(
            'LunarLanderContinuous-v2',
            [hitl_agent1, hitl_agent2, hitl_agent3])
        play(eval_env,
             zoom=zoom_level,
             fps=60,
             keys_to_action=keys_to_action,
             callback=print_rewards_callback)
    else:
        if trained_agent_type == 1:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_1p_sensor00.zip'
        elif trained_agent_type == 2:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_015p_noisy085.zip'
        else:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_075p_noisy085.zip'

        # load a saved human in the loop agent for LunarLander
        hitl_agent = SAC.load(HITL_LUNAR_AGENT_PATH)
        # create an instance of an evaluation environment, which takes in human actions in its "step" function
        eval_env = HITLSBLunarLanderContEval('LunarLanderContinuous-v2',
                                             hitl_agent)
        play(eval_env,
             zoom=zoom_level,
             fps=60,
             keys_to_action=keys_to_action,
             callback=print_rewards_callback)
Code example #3
def main():
    """
    # Example with a simple Dummy vec env
    env = gym.envs.make('panda-ip-reach-v0', renders= True)
    env = DummyVecEnv([lambda: env])
    """
    print("Env created !")

    env = PandaReachGymEnv(renders=True)

    env.render(mode='rgb_array')

    model = SAC.load("sac_panda_reach")
    print("model loaded !")

    while True:
        obs, done = env.reset(), False
        print("===================================")
        print("obs")
        print(obs)
        episode_rew = 0
        #while not done:
        for i in range(50):
            env.render(mode='rgb_array')
            action, _states = model.predict(obs)
            obs, rew, done, info = env.step(action)
            episode_rew += rew
            if done:
                break
        print("Episode reward", episode_rew)
Code example #4
def main(do_render: bool, seed: int, as_gdads: bool, name: str,
         do_train: bool):
    drop_abs_position = True

    conf: Conf = CONFS[name]
    dict_env = get_env(name=name,
                       drop_abs_position=drop_abs_position,
                       is_training=True)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env)
    else:
        flat_env = flatten_env(dict_env, drop_abs_position)
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    dict_env = get_env(name=name,
                       drop_abs_position=drop_abs_position,
                       is_training=False)
    if as_gdads:
        use_slider = False
        if use_slider:
            eval_env = SliderWrapper(env=dict_env)
        else:
            eval_env = GDADSEvalWrapper(dict_env,
                                        sw=BestSkillProvider(flat_env))
    else:
        eval_env = flatten_env(dict_env=dict_env,
                               drop_abs_position=drop_abs_position)

    filename = f"modelsCommandSkills/{name}/asGDADS{as_gdads}/resamplingFalse_goalSpaceTrue-seed-{seed}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename + ".zip", env=flat_env)
        print(f"loaded model {filename}")
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy",
                  env=flat_env,
                  verbose=1,
                  learning_rate=conf.lr,
                  tensorboard_log=filename,
                  buffer_size=conf.buffer_size,
                  batch_size=conf.batch_size,
                  gamma=gamma(conf.ep_len),
                  learning_starts=100 * conf.ep_len,
                  policy_kwargs=dict(log_std_init=-3,
                                     net_arch=[conf.layer_size] * 2),
                  seed=seed,
                  device="cuda",
                  train_freq=4)
    if do_train:
        train(model=sac, conf=conf, save_fname=filename, eval_env=eval_env)
    if do_render:
        show(model=sac, env=eval_env, conf=conf)
    do_eval = not do_train and not do_render
    if do_eval:
        results = ant_grid_evaluation(model=sac,
                                      env=eval_env,
                                      episode_len=conf.ep_len)
        dump_ant_grid_evaluation(results)
Code example #5
File: subcommand.py  Project: jessecha/airc-rl-agent
def command_demo(args, config):
    agent, callback = _init_agent(args, config, train=False)
    model = SAC.load(args.model_path)
    obs = agent.reset()
    for step in range(args.time_steps):
        if step % 100 == 0: print("step: ", step)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = agent.step(action)
Code example #6
 def __init__(self, algorithm: str, checkpoint_path: str):
     if algorithm == 'ppo':
         policy = PPO.load(checkpoint_path)
     elif algorithm == 'sac':
         policy = SAC.load(checkpoint_path)
     else:
         raise NotImplementedError
     self._model = policy
Code example #7
def test_train_freq(tmp_path, train_freq):

    model = SAC(
        "MlpPolicy",
        "Pendulum-v1",
        policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
        learning_starts=100,
        buffer_size=10000,
        verbose=1,
        train_freq=train_freq,
    )
    model.learn(total_timesteps=150)
    model.save(tmp_path / "test_save.zip")
    env = model.get_env()
    model = SAC.load(tmp_path / "test_save.zip", env=env)
    model.learn(total_timesteps=150)
    model = SAC.load(tmp_path / "test_save.zip", train_freq=train_freq, env=env)
    model.learn(total_timesteps=150)
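The second load above works because SAC.load forwards extra keyword arguments and uses them to override hyperparameters stored in the zip file. A short sketch under that assumption, reusing tmp_path and env from the test above (custom_objects is the related option for replacing a stored value outright; the concrete values are only illustrative):

# override a stored hyperparameter at load time via a keyword argument
model = SAC.load(tmp_path / "test_save.zip", env=env, learning_rate=1e-4)
# or replace the stored value wholesale with custom_objects
model = SAC.load(tmp_path / "test_save.zip", env=env, custom_objects={"learning_rate": 1e-4})
model.learn(total_timesteps=150)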
Code example #8
 def load_model(env, algorithm, filename):
     if algorithm == "ddpg":
         return DDPG.load(filename, env=env)
     elif algorithm == "td3":
         return TD3.load(filename, env=env)
     elif algorithm == "sac":
         return SAC.load(filename, env=env)
     else:
         raise Exception("--> Alican's LOG: Unknown agent type!")
Code example #9
def run(env, algname, filename):
    if algname == "TD3":
        model = TD3.load(f"{algname}_pkl")
    elif algname == "SAC":
        if filename:
            model = SAC.load(f"{filename}")
        else:
            model = SAC.load(f"{algname}_pkl")
    elif algname == "DDPG":
        model = DDPG.load(f"{algname}_pkl")
    else:
        raise "Wrong algorithm name provided."

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done:
            break
Code example #10
def hindsight_experience_replay_example():
    # Hindsight Experience Replay (HER).

    import highway_env

    env = gym.make("parking-v0")

    # Create 4 artificial transitions per real transition.
    n_sampled_goal = 4

    # SAC hyperparams:
    model = SAC(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=n_sampled_goal,
            goal_selection_strategy="future",
            # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper
            # we have to manually specify the max number of steps per episode.
            max_episode_length=100,
            online_sampling=True,
        ),
        verbose=1,
        buffer_size=int(1e6),
        learning_rate=1e-3,
        gamma=0.95,
        batch_size=256,
        policy_kwargs=dict(net_arch=[256, 256, 256]),
    )

    model.learn(int(2e5))
    model.save("her_sac_highway")

    # Load the saved model.
    # Because HER needs access to 'env.compute_reward()',
    # the model must be loaded with the env.
    model = SAC.load("her_sac_highway", env=env)

    obs = env.reset()

    # Evaluate the agent.
    episode_reward = 0
    for _ in range(100):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        episode_reward += reward
        if done or info.get("is_success", False):
            print("Reward:", episode_reward, "Success?",
                  info.get("is_success", False))
            episode_reward = 0.0
            obs = env.reset()
Code example #11
def multiModelPredict(cycle, interactive, *argv):
    if interactive:
        plt.ion()

    figure, axes = plt.subplots(len(argv), 4, figsize=(12, 6))
    figure.tight_layout()

    menv = HedgingEnv()
    menv.reset()
    market = menv.market

    envs = []
    obsrv = []
    models = []
    actions = []

    for model_name in argv:
        env = HedgingEnv()
        env.model_name = model_name
        envs += [env]
        models += [SAC.load(model_name)]
        obsrv += [env.reset(market)]
        actions += [None]

    cnt = 0
    while True:
        done = False
        for i in range(len(models)):
            actions[i], _states = models[i].predict(obsrv[i])
            obsrv[i], _reward, done, _ = envs[i].step(actions[i])

        if done:
            menv = HedgingEnv()
            menv.reset()
            market = menv.market

            # plot
            for i in range(len(envs)):
                axs = axes[i]
                env = envs[i]
                dashboard_axes(env, axs[0], axs[1], axs[2], axs[3])

            plt.show()
            plt.pause(0.005)  # pause a bit so that plots are updated

            for i in range(len(envs)):
                obsrv[i] = envs[i].reset(market)

            # stop after the requested number of episodes
            cnt += 1
            if cnt > cycle:
                break

    if interactive:
        plt.ioff()
Code example #12
File: show.py  Project: YvesP75/st-iPADS
def show():
    '''
    shows the i-PADS in Streamlit
    :return:
    '''
    env = TwoDimEnv()
    model = SAC.load("longModel")
    st.title('Intelligent PADS')
    st.sidebar.write("d'où est parti le parachute?")
    rho = st.sidebar.slider('à quelle distance?', 0, 500, 0)

    theta = 2 * PI / 360 * st.sidebar.slider('avec quel angle?', 0, 360, 0)
    zed = st.sidebar.slider('à quelle hauteur?', 0, 100, 150)
    pitch = st.sidebar.slider('pitch', 0, 100, 50)

    location = st.sidebar.radio("Lieu",
                                ['Fonsorbes', 'Paris', 'San Francisco'])
    lat_tg = LOC[location]['lat']
    lon_tg = LOC[location]['lon']
    df, df_path, df_col = run_episode(env,
                                      model,
                                      lat_tg,
                                      lon_tg,
                                      rho_init=rho,
                                      theta_init=theta,
                                      zed=zed)

    df_target = pd.DataFrame({'lat': [lat_tg], 'lon': [lon_tg]})
    deck_map = st.empty()
    initial_view_state = pdk.ViewState(latitude=lat_tg,
                                       longitude=lon_tg,
                                       zoom=12,
                                       pitch=pitch)
    deck_map.pydeck_chart(
        pdk.Deck(map_style='mapbox://styles/mapbox/light-v9',
                 initial_view_state=initial_view_state))
    df_pathi = df_path.copy()
    for i in range(zed):
        df_pathi['path'][0] = df_path['path'][0][0:i + 1]
        layers = get_layers(df[i:i + 1], df[0:i], df_target, df_pathi,
                            df_col[0:i + 1])
        deck_map.pydeck_chart(
            pdk.Deck(map_style='mapbox://styles/mapbox/light-v9',
                     initial_view_state=initial_view_state,
                     layers=layers))
        time.sleep(TIMESLEEP)
Code example #13
def test_sac():
    log_dir = f"model_save/best_model_sac_cnn"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = SAC.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:",i,"action:", action,"now profit:",env.profit)
            if done:
                print('stock', i, ' total profit=', env.profit, ' buy hold=',
                      env.buy_hold)
                break
Code example #14
def play():
    env = gym.make('kuka_iiwa_insertion-v0', use_gui=True)
    model = SAC.load("models/kuka_iiwa_insertion-v0_sac_best_model", env=env)

    obs = env.reset()
    i = 0
    episode_reward = 0.0
    while True:
        i += 1
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        episode_reward += rewards
        if i % 10 == 0 or dones:
            print(obs, episode_reward, rewards, info)
        if dones:
            print("=" * 20 + " RESET " + "=" * 20)
            episode_reward = 0
            obs = env.reset()
Code example #15
File: show.py  Project: YvesP75/st-iPADS
def show_print():
    '''
    for debug purposes: allows checking the content of the df passed to the layers
    :return:
    '''
    env = TwoDimEnv()
    model = SAC.load("longModel")
    rho, theta, zed = 100, 0, 10
    lat_tg, lon_tg = LOC['Fonsorbes']['lat'], LOC['Fonsorbes']['lon']
    df, df_path = run_episode(env,
                              model,
                              lat_tg,
                              lon_tg,
                              theta_init=theta,
                              zed=zed)
    df_pathi = df_path.copy()
    for i in range(zed):
        df_temp = pd.DataFrame([{'path': df_path['path'][0][0:i]}])
        df_pathi.update(df_temp)
        print(df_pathi)
Code example #16
File: rl_race.py  Project: helios57/rl-racing
def train_sac():
    latent_dim = 256
    vae = CVAE(latent_dim)
    vae.load_weights('./vae_256/checkpoint')
    env1 = DonkeyVAEEnv(vae, latent_dim, "Helios1")
    # use manual_override=None if you don't want to "help" the agent with w,a,s,d
    # env1 = DonkeyVAEEnv(vae, latent_dim, "Helios1", manual_override=ManualOverride())
    env1.client.collecting = False
    sac = SAC(env=env1,
              policy=MlpPolicy,
              buffer_size=20000,
              learning_starts=0,
              train_freq=20000,
              batch_size=256,
              verbose=2,
              gradient_steps=100,
              learning_rate=0.0005)
    # the next line loads a saved model to retrain it; comment it out to train from scratch
    sac = sac.load("sac/model_sb3", env=env1)
    # sac = sac.load("sac/model_sb3_lake_36", env=env1)
    # sac = sac.load("sac/model_sb3_lake_36_unscaled", env=env1)
    env1.client.hardReset()
    env1.client.initCar()
    env1.client.reset()
    env1.client.restartScene()
    env1.client.hardReset()
    env1.client.initCar()
    env1.client.reset()
    env1.client.collecting = True
    env1.client.telemetrie = []
    while True:
        observation, index = env1.get_observation()
        action = sac.predict(np.asarray([observation]),
                             deterministic=False)[0][0]
        steering, throttle = action[0], action[1]
        env1.client.send_controls(steering * 0.4, throttle)
        # env1.client.send_controls(steering * 0.7, throttle * 0.8)
        print(
            str(index) + " steering:" + str(action[0]) + " throttle:" +
            str(action[1]) + " speed:" +
            str(env1.client.telemetrie[index].speed))
Code example #17
def _load_sac(agent, args, config, policy):
    model = None
    if args.load_model == '':
        model = SAC("MlpPolicy",
                    policy_kwargs=policy,
                    env=agent,
                    verbose=config.sac_verbose(),
                    batch_size=config.sac_batch_size(),
                    buffer_size=config.sac_buffer_size(),
                    learning_starts=config.sac_learning_starts(),
                    gradient_steps=config.sac_gradient_steps(),
                    train_freq=config.sac_train_freq(),
                    ent_coef=config.sac_ent_coef(),
                    learning_rate=config.sac_learning_rate(),
                    tensorboard_log="tblog",
                    gamma=config.sac_gamma(),
                    tau=config.sac_tau(),
                    use_sde_at_warmup=config.sac_use_sde_at_warmup(),
                    use_sde=config.sac_use_sde(),
                    sde_sample_freq=config.sac_sde_sample_freq(),
                    n_episodes_rollout=1)
    else:
        model = SAC.load(args.load_model,
                         env=agent,
                         policy_kwargs=policy,
                         verbose=config.sac_verbose(),
                         batch_size=config.sac_batch_size(),
                         buffer_size=config.sac_buffer_size(),
                         learning_starts=config.sac_learning_starts(),
                         gradient_steps=config.sac_gradient_steps(),
                         train_freq=config.sac_train_freq(),
                         ent_coef=config.sac_ent_coef(),
                         learning_rate=config.sac_learning_rate(),
                         tensorboard_log="tblog",
                         gamma=config.sac_gamma(),
                         tau=config.sac_tau(),
                         use_sde_at_warmup=config.sac_use_sde_at_warmup(),
                         use_sde=config.sac_use_sde(),
                         sde_sample_freq=config.sac_sde_sample_freq(),
                         n_episodes_rollout=1)
    return model
Code example #18
def main():
    as_gdads = True
    name = "pointmass"
    drop_abs_position = True

    dads_env_fn = envs_fns[name]
    conf: Conf = CONFS[name]

    dict_env = as_dict_env(dads_env_fn())
    dict_env = TimeLimit(dict_env, max_episode_steps=conf.ep_len)
    if drop_abs_position:
        dict_env = DropGoalEnvsAbsoluteLocation(dict_env)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env, skill_reset_steps=conf.ep_len // 2)
    else:
        flat_obs_content = ["observation", "desired_goal", "achieved_goal"]
        if drop_abs_position:
            flat_obs_content.remove("achieved_goal")  # because it is always the zero vector
        flat_env = FlattenObservation(FilterObservation(dict_env, filter_keys=flat_obs_content))

    flat_env = TransformReward(flat_env, f=lambda r: r*conf.reward_scaling)
    flat_env = Monitor(flat_env)

    filename = f"modelsCommandSkills/{name}-gdads{as_gdads}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename, env=flat_env)
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=f"{filename}-tb", buffer_size=10000)
        train(model=sac, conf=conf, save_fname=filename)
        if as_gdads:
            flat_env.save(filename)

    if as_gdads:
        flat_env.set_sac(sac)
        eval_dict_env(dict_env=dict_env,
                      model=flat_env,
                      ep_len=conf.ep_len)
    show(model=sac, env=flat_env, conf=conf)
Code example #19
def singleModelPredict(model_name, cycle=5, interactive=True):
    if interactive:
        plt.ion()

    env = HedgingEnv()
    #env.mu = -0.5
    model = SAC.load(model_name)
    obs = env.reset()
    cnt = 0

    reward_history = []
    while True:
        action, _states = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_history += [reward]
        if done:
            env.render()
            obs = env.reset()
            cnt += 1
            if cnt > cycle:
                break

    if interactive:
        plt.ioff()
Code example #20
File: SAC-play.py  Project: xrayfinding/laikago_robot
    parser.add_argument('-m', '--run_mode', default='train')
    parser.add_argument('-s', '--simulator', default='mujoco')

    args = parser.parse_args()
    version = args.version
    task_name = args.task_name
    run_mode = args.run_mode
    simulator = args.simulator

    if args.load_version is None:
        best_model_save_path = './{}/{}/SAC-v{}/logs/best_model.zip'.format(simulator, task_name, version)
    else:
        best_model_save_path = './{}/{}/SAC-v{}/logs/best_model.zip'.format(simulator, task_name, args.load_version)

    env = build_env(task_name, version, run_mode, simulator, visual=True, ctrl_delay=True)
    model = SAC.load(best_model_save_path, device=torch.device('cuda:0'))

    obs = env.reset()
    total_reward = 0
    for i in range(10000):
        action, _states = model.predict(obs, deterministic=True)
        # action = np.array([-10, 30, -75,
        #            10, 30, -75,
        #            -10, 50, -75,
        #            10, 50, -75]) * np.pi / 180
        # action = env.action_space.sample()
        obs, reward, done, info = env.step(action)

        print(info['energy'])

        total_reward += reward
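The hard-coded device=torch.device('cuda:0') above assumes a CUDA GPU is available. A minimal sketch of a CPU fallback, reusing best_model_save_path from the snippet and assuming the same torch and SB3 APIs:

import torch
from stable_baselines3 import SAC

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = SAC.load(best_model_save_path, device=device)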
Code example #21
from stable_baselines3 import PPO, SAC
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

from common import common
import common.gym_interface as gym_interface

if __name__ == "__main__":
    hyperparams = common.load_hyperparameters(conf_name="SAC")

    venv = DummyVecEnv([gym_interface.make_env(robot_body=300)])

    keys_remove = ["normalize", "n_envs", "n_timesteps", "policy"]
    for key in keys_remove:
        if key in hyperparams:
            del hyperparams[key]

    model = SAC('MlpPolicy', venv, verbose=1, seed=common.seed, **hyperparams)
    model.save("output_data/tmp/tmp")

    model = SAC.load("output_data/tmp/tmp.zip")
    model = SAC.load("output_data/models/best_model.zip")
Code example #22
File: ttt_action.py  Project: tpvt99/sbcs5478
# Step 3.b. Make a vectorized environment so that normalization or frame stacking can be used (optional)
env = make_vec_env(lambda: env, n_envs=1)
# Step 3.b. Apply normalization and frame stacking (optional)

env = VecFrameStack(
    env,
    n_stack=custom_params['FRAME_STACK'])  # use 1 for now because we use image observations
if not custom_params['USING_VAE']:
    env = VecTransposeImage(env)  # transpose image observations to channel-first
if custom_params['USING_NORMALIZATION']:
    env = VecNormalize.load(osp.join(results_dir, "vec_normalization.pkl"),
                            env)

# Load the agent
if custom_params['algo'] == 'sac':
    model = SAC.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'a2c':
    model = A2C.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'dqn':
    model = DQN.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'ppo':
    model = PPO.load(osp.join(results_dir, "best_model", "best_model.zip"))

else:
    raise ValueError(f"Unknown algorithm: {custom_params['algo']}")

# The normalization statistics were loaded above;
# do not update them at test time
env.training = False
# reward normalization is not needed at test time
env.norm_reward = False
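The vec_normalization.pkl loaded above has to be written out on the training side. A minimal sketch of that counterpart, assuming the training environment was wrapped in VecNormalize (make_training_env and results_dir are placeholders):

import os.path as osp
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

train_env = VecNormalize(DummyVecEnv([make_training_env]), norm_obs=True, norm_reward=True)
# ... train the model on train_env ...
train_env.save(osp.join(results_dir, "vec_normalization.pkl"))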
Code example #23
File: DRL_battery.py  Project: jajimer/energym
            gcloud.read_from_bucket(client, bucket_name, model_path)
            model_path = './' + model_path
        else:
            model_path = args.model

        model = None
        if args.algorithm == 'DQN':
            model = DQN.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'DDPG':
            model = DDPG.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'A2C':
            model = A2C.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'PPO':
            model = PPO.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'SAC':
            model = SAC.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'TD3':
            model = TD3.load(model_path, tensorboard_log=args.tensorboard)
        else:
            raise RuntimeError('Algorithm specified is not registered.')

        model.set_env(env)

    # ---------------------------------------------------------------------------- #
    #       Calculating total training timesteps based on number of episodes       #
    # ---------------------------------------------------------------------------- #
    n_timesteps_episode = env.simulator._eplus_one_epi_len / \
        env.simulator._eplus_run_stepsize
    timesteps = args.episodes * n_timesteps_episode - 1

    # ---------------------------------------------------------------------------- #
Code example #24
from stable_baselines3 import SAC
import pybullet_envs
import gym
from stable_baselines3.common.callbacks import EvalCallback

ENV_NAME = 'HalfCheetahBulletEnv-v0'
TIME_STEPS = 100000

env = gym.make(ENV_NAME)
eval_env = gym.make(ENV_NAME)

model = SAC.load("logs/best_model")


max_v = 0
min_v = 0

total_reward = 0
env.render()
obs = env.reset()
for i in range(100000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    total_reward += reward
    env.render()
    if done:
        obs = env.reset()
        print('Test reward is {:.3f}.'.format(total_reward))
        total_reward = 0
env.close()
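The same check can be done with SB3's built-in helper instead of the manual loop. A short sketch reusing model and eval_env from above:

from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, render=False)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")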
Code example #25
                         simulator,
                         visual=False,
                         ctrl_delay=True)

    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=best_model_save_path,
                                 log_path=log_path,
                                 eval_freq=5000,
                                 deterministic=True,
                                 render=False)
    policy_kwargs = dict(activation_fn=torch.nn.ReLU, net_arch=net_arch)

    if args.load_version is not None:
        best_model_dir = './{}/{}/SAC-v{}/logs/best_model.zip'.format(
            simulator, task_name, args.load_version)
        model = SAC.load(best_model_dir, device=torch.device('cuda:0'))
        model.set_env(env)
        model.tensorboard_log = tensorboard_log
        model.num_timesteps = 0
        model.learning_starts = args.learning_starts
        model.buffer_size = args.buffer_size
        model.learning_rate = learning_rate
        if ent_coef == 'auto':
            init_value = 1.0
            model.log_ent_coef = torch.log(
                torch.ones(1, device=model.device) *
                init_value).requires_grad_(True)
            model.ent_coef_optimizer = torch.optim.Adam(
                [model.log_ent_coef], lr=model.lr_schedule(1))

    else:
Code example #26
def advanced_saving_and_loading_example():
    # Advanced Saving and Loading.

    from stable_baselines3.sac.policies import MlpPolicy

    # Create the model, the training environment and the test environment (for evaluation).
    model = SAC('MlpPolicy',
                'Pendulum-v1',
                verbose=1,
                learning_rate=1e-3,
                create_eval_env=True)

    # Evaluate the model every 1000 steps on 5 test episodes and save the evaluation to the "logs/" folder.
    model.learn(6000,
                eval_freq=1000,
                n_eval_episodes=5,
                eval_log_path="./logs/")

    # Save the model.
    model.save("sac_pendulum")

    # The saved model does not contain the replay buffer.
    loaded_model = SAC.load("sac_pendulum")
    print(
        f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer"
    )

    # Now save the replay buffer too.
    model.save_replay_buffer("sac_replay_buffer")

    # Load it into the loaded_model.
    loaded_model.load_replay_buffer("sac_replay_buffer")

    # Now the loaded replay is not empty anymore.
    print(
        f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer"
    )

    # Save the policy independently from the model.
    # Note: if you don't save the complete model with 'model.save()'
    # you cannot continue training afterward.
    policy = model.policy
    policy.save("sac_policy_pendulum")

    # Retrieve the environment.
    env = model.get_env()

    # Evaluate the policy.
    mean_reward, std_reward = evaluate_policy(policy,
                                              env,
                                              n_eval_episodes=10,
                                              deterministic=True)

    print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

    # Load the policy independently from the model.
    saved_policy = MlpPolicy.load("sac_policy_pendulum")

    # Evaluate the loaded policy.
    mean_reward, std_reward = evaluate_policy(saved_policy,
                                              env,
                                              n_eval_episodes=10,
                                              deterministic=True)

    print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
Code example #27
 for i in range(start_loop, end_loop):
     print("EVAL ", i)
     avg_dis_reward_run = []
     for j in range(0, 10):
         print("SEED 0")
         lambd = np.load(f"./{args.folder}/buffers/lambda_{args.algo}_{j}.npy")
         N = np.load(f"./{args.folder}/buffers/N_{args.algo}_{j}.npy")
         model_name = f"./{args.folder}/models/model_{args.algo}_{j}_{i}"
         #print ("Lambd N i ", lambd[i], N[i])
         env.set_N(int(N[i]), list(lambd[i]))
         if args.algo == 0:
             model = PPO.load(model_name, env)
         elif args.algo == 1:
             model = A2C.load(model_name, env)
         elif args.algo == 2:
             model = SAC.load(model_name, env)
         elif args.algo == 3:
             thres_vec = np.load(
                 f"./{args.folder}/buffers/thresvec_{args.env_name}_{j}.npy"
             )
             model.set_threshold_vec(thres_vec[i])
         avg_dis_reward = 0.0
         for k in range(100):
             env.seed(k)
             obs = env.reset()
             reward_traj = []
             dis_reward = 0.0
             for t in range(int(1e3)):
                 if args.algo == 3:
                     action = model.select_action(np.array(obs), eval_=True)
                 else:
Code example #28
File: train.py  Project: apigott/modelicagym
def eval_model(env, model_name):
    env.reset()
    env.reset_dymola()

    mode = 'load'

    if mode == 'load':
        model = SAC.load(model_name, env=env)
    else:
        model = SAC(MlpPolicy, env, learning_rate=10**-4, verbose=1, tensorboard_log='tensorboard_log')
        tic = time.time()

        env.reset()
        model.learn(10000, reset_num_timesteps=False)
        model.save("IEEE9_5k_v4")

        toc = time.time()
        print(toc-tic)

    obs = env.reset()
    actions = []
    rewards = []
    for _ in range(250):
        action = model.predict(obs)[0]
        actions += action.tolist()
        obs, reward, done, info = env.step(action)
        rewards += [reward]
    volt_norm = []
    legend = []
    fig, ax = plt.subplots(5,3,figsize=(40,30))
    for i in range(3):
        for j in range(3):
            bus = 1 + 3*i + j
            bus_volt = np.array(env.debug_data[f'iEEE_14_Buses.B{bus}.V'])-1.0
            volt_norm += [bus_volt]
            ax[i][j].plot(env.debug_data['iEEE_14_Buses.my_time'], bus_volt, color='r')
            ax[i][j].set_ylabel('Voltage Dev')
            ax[i][j].set_title(f'Bus {bus}')

    legend += ['RL Agent']
    gen = ['G2.gENROU.P','G1.gENSAL.P']
    for j in range(2):
        ax[3][j].plot(env.debug_data['iEEE_14_Buses.my_time'], env.debug_data[f'iEEE_14_Buses.{gen[j]}'], color='r')
        ax[3][j].set_xlabel('Time (sec)')
        ax[3][j].set_title(f'{gen[j]} Output')
        ax[4][j].plot(np.arange(250), actions[j::2], color='r')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'],np.divide(np.cumsum(np.linalg.norm(volt_norm, axis=0)),np.clip(env.debug_data['iEEE_14_Buses.my_time'], 1, np.inf)), color='r')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'],np.linalg.norm(volt_norm, axis=0), color='r')
    ax[4][2].plot(np.arange(250), rewards, color='r')

    env.reset()
    actions = []
    rewards = []
    for _ in range(250):
        action = env.action_space.sample()
        actions += action.tolist()
        obs, reward, done, info = env.step(action)
        rewards += [reward]
    volt_norm = []
    for i in range(3):
        for j in range(3):
            bus = 1 + 3*i + j
            bus_volt = np.array(env.debug_data[f'iEEE_14_Buses.B{bus}.V'])-1.0
            volt_norm += [bus_volt]
            ax[i][j].plot(env.debug_data['iEEE_14_Buses.my_time'], bus_volt, color='b')
    legend += ['Randomized']
    for j in range(2):
        ax[3][j].plot(env.debug_data['iEEE_14_Buses.my_time'], env.debug_data[f'iEEE_14_Buses.{gen[j]}'], color='b')
        ax[4][j].plot(np.arange(250), actions[j::2], color='b')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'],np.linalg.norm(volt_norm, axis=0), color='b')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'],np.divide(np.cumsum(np.linalg.norm(volt_norm, axis=0)),np.clip(env.debug_data['iEEE_14_Buses.my_time'], 1, np.inf)), color='b')
    ax[4][2].plot(np.arange(250), rewards, color='b')

    env.reset()
    actions = []
    rewards = []
    for _ in range(250):
        action = env.default_action # null action
        actions += action
        obs, reward, done, info = env.step(action)
        rewards += [reward]

    volt_norm = []
    for i in range(3):
        for j in range(3):
            bus = 1 + 3*i + j
            bus_volt = np.array(env.debug_data[f'iEEE_14_Buses.B{bus}.V'])-1.0
            volt_norm += [bus_volt]
            ax[i][j].plot(env.debug_data['iEEE_14_Buses.my_time'], bus_volt, color='g')
    legend += ['Do Nothing']
    for j in range(2):
        ax[3][j].plot(env.debug_data['iEEE_14_Buses.my_time'], env.debug_data[f'iEEE_14_Buses.{gen[j]}'], color='g')
        ax[4][j].plot(np.arange(250), actions[j::2], color='g')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'],np.divide(np.cumsum(np.linalg.norm(volt_norm, axis=0)),np.clip(env.debug_data['iEEE_14_Buses.my_time'], 1, np.inf)), color='g')
    ax[3][2].plot(env.debug_data['iEEE_14_Buses.my_time'],np.linalg.norm(volt_norm, axis=0), color='g')
    ax[0][2].legend(legend)
    ax[4][2].plot(np.arange(250), rewards, color='g')

    env.dymola.close()

    plt.savefig(model_name)
    plt.show()
    return
Code example #29
    #### Load the model from file ##############################
    algo = ARGS.exp.split("-")[2]

    if os.path.isfile(ARGS.exp + '/success_model.zip'):
        path = ARGS.exp + '/success_model.zip'
    elif os.path.isfile(ARGS.exp + '/best_model.zip'):
        path = ARGS.exp + '/best_model.zip'
    else:
        print("[ERROR]: no model under the specified path", ARGS.exp)
        exit()
    if algo == 'a2c':
        model = A2C.load(path)
    if algo == 'ppo':
        model = PPO.load(path)
    if algo == 'sac':
        model = SAC.load(path)
    if algo == 'td3':
        model = TD3.load(path)
    if algo == 'ddpg':
        model = DDPG.load(path)

    #### Parameters to recreate the environment ################
    env_name = ARGS.exp.split("-")[1] + "-aviary-v0"
    OBS = ObservationType.KIN if ARGS.exp.split(
        "-")[3] == 'kin' else ObservationType.RGB
    if ARGS.exp.split("-")[4] == 'rpm':
        ACT = ActionType.RPM
    elif ARGS.exp.split("-")[4] == 'dyn':
        ACT = ActionType.DYN
    elif ARGS.exp.split("-")[4] == 'pid':
        ACT = ActionType.PID
Code example #30
File: check.py  Project: tail-island/sally
import os

from funcy import last
from glob import glob
from self_driving import SelfDriving
from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy

env = SelfDriving()

model_path = last(sorted(glob('log/*.zip'), key=lambda f: os.stat(f).st_mtime))
model = SAC.load(model_path, env)

print(model_path)

reward_mean, _ = evaluate_policy(model,
                                 env,
                                 n_eval_episodes=1,
                                 render=True,
                                 warn=False)

print(f'reward: {reward_mean:.02f}')

for _ in range(10):
    env.seed(None)  # Set the random seed to None (i.e. use the current time).

    observation = env.reset()
    done = False

    while not done:
        action, _ = model.predict(observation, deterministic=True)
        observation, _, done, _ = env.step(action)