Code example #1
def main(args):
    # 1. Start a W&B run
    wandb.init(project='pearl', entity='adlr-ss-21-05')
    wandb.config.update(args)
    print("wandb name: ", wandb.run.name)

    log_dir = "tmp/" + wandb.run.name + "/"
    os.makedirs(log_dir, exist_ok=True)

    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                check_log=1,
                                                log_dir=log_dir,
                                                model_name=wandb.run.name)

    env = gym.make('kuka_iiwa_insertion-v0',
                   use_gui=False,
                   steps_per_action=args.steps_per_action,
                   max_steps=args.max_steps,
                   action_step_size=args.action_step_size)
    env = Monitor(env, log_dir)

    model = SAC("MlpPolicy",
                env,
                verbose=args.verbosity,
                train_freq=(args.train_freq_num, args.train_freq_type),
                batch_size=args.batch_size)

    i = 0
    save_interval = 1000000
    while True:
        i += save_interval
        model.learn(total_timesteps=save_interval, callback=callback)
Code example #2
def test_offpolicy_normalization(model_class, online_sampling):

    if online_sampling and model_class != HerReplayBuffer:
        pytest.skip()

    make_env_ = make_dict_env if model_class == HerReplayBuffer else make_env
    env = DummyVecEnv([make_env_])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)

    eval_env = DummyVecEnv([make_env_])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False, clip_obs=10.0, clip_reward=10.0)

    if model_class == HerReplayBuffer:
        model = SAC(
            "MultiInputPolicy",
            env,
            verbose=1,
            learning_starts=100,
            policy_kwargs=dict(net_arch=[64]),
            replay_buffer_kwargs=dict(
                max_episode_length=100,
                online_sampling=online_sampling,
                n_sampled_goal=2,
            ),
            replay_buffer_class=HerReplayBuffer,
            seed=2,
        )
    else:
        model = model_class("MlpPolicy", env, verbose=1, learning_starts=100, policy_kwargs=dict(net_arch=[64]))

    model.learn(total_timesteps=150, eval_env=eval_env, eval_freq=75)
    # Check getter
    assert isinstance(model.get_vec_normalize_env(), VecNormalize)
Code example #3
def test_goal_selection_strategy(goal_selection_strategy, online_sampling):
    """
    Test different goal strategies.
    """
    env = BitFlippingEnv(continuous=True)

    normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))

    model = SAC(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            goal_selection_strategy=goal_selection_strategy,
            online_sampling=online_sampling,
            max_episode_length=10,
            n_sampled_goal=2,
        ),
        train_freq=4,
        gradient_steps=1,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=100,
        buffer_size=int(1e5),
        action_noise=normal_action_noise,
    )
    assert model.action_noise is not None
    model.learn(total_timesteps=150)
Code example #4
def param_buff():
    res1 = [0, 0.5, 1, 1.5, 2]
    res2 = [1000, 2000, 10000, 100000, 10**6]
    res3 = [500, 500, 5000, 5000, 5000]

    for j, k in zip(res2, res3):
        for i in res1:
            model = SAC(
                "MlpPolicy",
                "Pendulum-v0",
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=k,
                verbose=1,
                create_eval_env=True,
                buffer_size=j,
                ent_coef=i,
                action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                # tensorboard_log="./sac_pendulum_tensorboard/"
            )
            eval_env = gym.make('Pendulum-v0')
            eval_callback = EvalCallback(eval_env,
                                         best_model_save_path='./logs/',
                                         log_path='./logs/alpha2c',
                                         eval_freq=250,
                                         deterministic=True,
                                         render=False)
            model.learn(total_timesteps=20000, callback=eval_callback)

    return res1, res2
Code example #5
def main(do_render: bool, seed: int, as_gdads: bool, name: str,
         do_train: bool):
    drop_abs_position = True

    conf: Conf = CONFS[name]
    dict_env = get_env(name=name,
                       drop_abs_position=drop_abs_position,
                       is_training=True)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env)
    else:
        flat_env = flatten_env(dict_env, drop_abs_position)
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    dict_env = get_env(name=name,
                       drop_abs_position=drop_abs_position,
                       is_training=False)
    if as_gdads:
        use_slider = False
        if use_slider:
            eval_env = SliderWrapper(env=dict_env)
        else:
            eval_env = GDADSEvalWrapper(dict_env,
                                        sw=BestSkillProvider(flat_env))
    else:
        eval_env = flatten_env(dict_env=dict_env,
                               drop_abs_position=drop_abs_position)

    filename = f"modelsCommandSkills/{name}/asGDADS{as_gdads}/resamplingFalse_goalSpaceTrue-seed-{seed}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename + ".zip", env=flat_env)
        print(f"loaded model {filename}")
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy",
                  env=flat_env,
                  verbose=1,
                  learning_rate=conf.lr,
                  tensorboard_log=filename,
                  buffer_size=conf.buffer_size,
                  batch_size=conf.batch_size,
                  gamma=gamma(conf.ep_len),
                  learning_starts=100 * conf.ep_len,
                  policy_kwargs=dict(log_std_init=-3,
                                     net_arch=[conf.layer_size] * 2),
                  seed=seed,
                  device="cuda",
                  train_freq=4)
    if do_train:
        train(model=sac, conf=conf, save_fname=filename, eval_env=eval_env)
    if do_render:
        show(model=sac, env=eval_env, conf=conf)
    do_eval = not do_train and not do_render
    if do_eval:
        results = ant_grid_evaluation(model=sac,
                                      env=eval_env,
                                      episode_len=conf.ep_len)
        dump_ant_grid_evaluation(results)
Code example #6
def test_full_replay_buffer():
    """
    Test if HER works correctly with a full replay buffer when using online sampling.
    It should not sample the current episode which is not finished.
    """
    n_bits = 4
    env = BitFlippingEnv(n_bits=n_bits, continuous=True)

    # use small buffer size to get the buffer full
    model = SAC(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=2,
            goal_selection_strategy="future",
            online_sampling=True,
            max_episode_length=n_bits,
        ),
        gradient_steps=1,
        train_freq=4,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=1,
        buffer_size=20,
        verbose=1,
        seed=757,
    )

    model.learn(total_timesteps=100)
Code example #7
File: test_run.py Project: wmmc88/stable-baselines3
def test_n_critics(n_critics):
    # Test SAC with different number of critics, for TD3, n_critics=1 corresponds to DDPG
    model = SAC("MlpPolicy",
                "Pendulum-v0",
                policy_kwargs=dict(net_arch=[64, 64], n_critics=n_critics),
                learning_starts=100,
                verbose=1)
    model.learn(total_timesteps=500)
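
The comment above notes that for TD3, n_critics=1 corresponds to DDPG. A minimal sketch of that TD3 configuration, assuming the same Pendulum setup; the policy_delay and target_policy_noise values below are illustrative choices that make the reduction to DDPG explicit, not taken from the test suite.

from stable_baselines3 import TD3

model = TD3("MlpPolicy",
            "Pendulum-v0",
            policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
            policy_delay=1,            # update the actor every step, as DDPG does
            target_policy_noise=0.0,   # disable target policy smoothing
            learning_starts=100,
            verbose=1)
model.learn(total_timesteps=500)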
Code example #8
def multiprocessing_with_off_policy_algorithms_example():
    # Multiprocessing with off-policy algorithms.

    env = make_vec_env("Pendulum-v1", n_envs=4, seed=0)

    # We collect 4 transitions per call to 'env.step()' and performs 2 gradient steps per call to 'env.step()'
    # if gradient_steps=-1, then we would do 4 gradients steps per call to 'env.step()'.
    model = SAC("MlpPolicy", env, train_freq=1, gradient_steps=2, verbose=1)
    model.learn(total_timesteps=10_000)
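
For the gradient_steps=-1 case mentioned in the comment, a minimal sketch under the same assumptions (the same 4-env vectorized Pendulum setup): SB3 then performs one gradient step per collected transition, i.e. 4 gradient steps per call to env.step().

# Same vectorized env as above; gradient_steps=-1 matches the number of
# gradient steps to the number of transitions collected per rollout.
model = SAC("MlpPolicy", env, train_freq=1, gradient_steps=-1, verbose=1)
model.learn(total_timesteps=10_000)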
Code example #9
def main(trained_agent_type, zoom_level):
    # mapping lunar lander controls to "W" (main engine), "A" (left engine), "D" (right engine)
    keys_to_action = {
        (ord('w'), ): 2,
        (ord('a'), ): 1,
        (ord('d'), ): 3,
        (ord('d'), ord('w')): 3,
        (ord('a'), ord('w')): 1,
    }

    # Checking for various trained_agent_type that might be selected.
    # 0: The human has full control.
    # 1: Trained with Sensor human and intervention penalty of 1
    # 2: Trained with Noisy human and intervention penalty of 0.15
    # 3: Trained with Noisy human and intervention penalty of 0.75
    # 4: Ensemble of 1, 2, and 3. i.e. an action is sampled uniformly randomly from one of those agents at each timestep
    if trained_agent_type == 0:
        # this agent doesn't actually do anything, just a placeholder to satisfy HITLSBLunarLanderContEval's API
        hitl_agent = SAC.load('savedModels/sac_lunar_hitl_1p_sensor00.zip')
        eval_env = HITLSBLunarLanderContEval('LunarLanderContinuous-v2',
                                             hitl_agent,
                                             do_not_intervene=True)
        play(eval_env,
             zoom=zoom_level,
             fps=60,
             keys_to_action=keys_to_action,
             callback=print_rewards_callback)
    elif trained_agent_type == 4:
        hitl_agent1 = SAC.load('savedModels/sac_lunar_hitl_1p_sensor00.zip')
        hitl_agent2 = SAC.load('savedModels/sac_lunar_hitl_015p_noisy085.zip')
        hitl_agent3 = SAC.load('savedModels/sac_lunar_hitl_075p_noisy085.zip')
        eval_env = HITLSBLunarLanderContEval(
            'LunarLanderContinuous-v2',
            [hitl_agent1, hitl_agent2, hitl_agent3])
        play(eval_env,
             zoom=zoom_level,
             fps=60,
             keys_to_action=keys_to_action,
             callback=print_rewards_callback)
    else:
        if trained_agent_type == 1:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_1p_sensor00.zip'
        elif trained_agent_type == 2:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_015p_noisy085.zip'
        else:
            HITL_LUNAR_AGENT_PATH = 'savedModels/sac_lunar_hitl_075p_noisy085.zip'

        # load a saved human in the loop agent for LunarLander
        hitl_agent = SAC.load(HITL_LUNAR_AGENT_PATH)
        # create an instance of an evaluation environment, which takes in human actions in its "step" function
        eval_env = HITLSBLunarLanderContEval('LunarLanderContinuous-v2',
                                             hitl_agent)
        play(eval_env,
             zoom=zoom_level,
             fps=60,
             keys_to_action=keys_to_action,
             callback=print_rewards_callback)
Code example #10
def test_sac(ent_coef):
    model = SAC('MlpPolicy',
                'Pendulum-v0',
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=100,
                verbose=1,
                create_eval_env=True,
                ent_coef=ent_coef,
                action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)))
    model.learn(total_timesteps=1000, eval_freq=500)
Code example #11
def train(model: SAC, conf: Conf, save_fname: str, eval_env):
    model.env.reset()
    model.learn(total_timesteps=conf.ep_len * conf.num_episodes,
                log_interval=10,
                callback=[
                    eval_cb(env=eval_env, conf=conf, save_fname=save_fname),
                    LogDeltaStatistics(n_steps=conf.ep_len),
                    LogDeltasHistogram(env=model.env,
                                       freq_in_steps=25 * conf.ep_len)
                ],
                reset_num_timesteps=False)
Code example #12
    def __init__(self, env, hyperparameters=DEFAULT_HYPERPARAMETERS):
        self.P = hyperparameters

        if self.P["model_class"] == "dqn":
            from stable_baselines3 import DQN
            self.model = DQN('MlpPolicy', env, verbose=self.P["verbose"])
            self.model_class = DQN

        elif self.P["model_class"] == "a2c":
            from stable_baselines3 import A2C
            from stable_baselines3.a2c import MlpPolicy
            self.model = A2C(MlpPolicy, env, verbose=self.P["verbose"])
            self.model_class = A2C

        elif self.P["model_class"] == "ddpg":
            from stable_baselines3 import DDPG
            from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
            n_actions = env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            self.model = DDPG('MlpPolicy',
                              env,
                              action_noise=action_noise,
                              verbose=self.P["verbose"])
            self.model_class = DDPG

        elif self.P["model_class"] == "td3":
            from stable_baselines3 import TD3
            from stable_baselines3.td3.policies import MlpPolicy
            from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
            n_actions = env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            self.model = TD3(MlpPolicy,
                             env,
                             action_noise=action_noise,
                             verbose=self.P["verbose"])
            self.model_class = TD3

        elif self.P["model_class"] == "ppo":
            from stable_baselines3 import PPO
            from stable_baselines3.ppo import MlpPolicy
            self.model = PPO(MlpPolicy, env, verbose=self.P["verbose"])
            self.model_class = PPO

        elif self.P["model_class"] == "sac":
            from stable_baselines3 import SAC
            from stable_baselines3.sac import MlpPolicy
            self.model = SAC(MlpPolicy, env, verbose=self.P["verbose"])
            self.model_class = SAC

        else:
            raise NotImplementedError()
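
A hypothetical usage sketch for this wrapper. The class name Agent and the hyperparameter dict are assumptions for illustration; only the "model_class" and "verbose" keys that __init__ actually reads are supplied.

import gym

env = gym.make("Pendulum-v0")
# "Agent" is a placeholder name for whatever class defines the __init__ above.
agent = Agent(env, hyperparameters={"model_class": "sac", "verbose": 1})
agent.model.learn(total_timesteps=1_000)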
Code example #13
File: test_run.py Project: mjlbach/stable-baselines3
def test_train_freq_fail(train_freq):
    with pytest.raises(ValueError):
        model = SAC(
            "MlpPolicy",
            "Pendulum-v0",
            policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
            learning_starts=100,
            buffer_size=10000,
            verbose=1,
            train_freq=train_freq,
        )
        model.learn(total_timesteps=250)
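
The test above expects SAC to reject a malformed train_freq at construction time. A possible parametrization with illustrative invalid values (assumed, not copied from the repository): a non-integer frequency or an unknown frequency unit should raise the expected ValueError.

import pytest

@pytest.mark.parametrize("train_freq", ["not_a_number", ("1", "episode"), (1, "horizon")])
def test_train_freq_fail(train_freq):
    ...  # body as in the example above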
Code example #14
File: test_run.py Project: mjlbach/stable-baselines3
def test_sac(ent_coef):
    model = SAC(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=100,
        verbose=1,
        create_eval_env=True,
        buffer_size=250,
        ent_coef=ent_coef,
        action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
    )
    model.learn(total_timesteps=300, eval_freq=250)
Code example #15
def main():
    """
    # Example with a simple Dummy vec env
    env = gym.envs.make('panda-ip-reach-v0', renders= True)
    env = DummyVecEnv([lambda: env])
    """
    print("Env created !")

    env = PandaReachGymEnv(renders=True)

    env.render(mode='rgb_array')

    model = SAC.load("sac_panda_reach")
    print("model loaded !")

    while True:
        obs, done = env.reset(), False
        print("===================================")
        print("obs")
        print(obs)
        episode_rew = 0
        #while not done:
        for i in range(50):
            env.render(mode='rgb_array')
            action, _states = model.predict(obs)
            obs, rew, done, info = env.step(action)
            episode_rew += rew
            if done:
                break
        print("Episode reward", episode_rew)
Code example #16
def test_save_load_pytorch_var(tmp_path):
    model = SAC("MlpPolicy", "Pendulum-v0", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1))
    model.learn(200)
    save_path = str(tmp_path / "sac_pendulum")
    model.save(save_path)
    env = model.get_env()
    ent_coef_before = model.log_ent_coef

    del model

    model = SAC.load(save_path, env=env)
    assert th.allclose(ent_coef_before, model.log_ent_coef)
    model.learn(200)
    ent_coef_after = model.log_ent_coef
    # Check that the entropy coefficient is still optimized
    assert not th.allclose(ent_coef_before, ent_coef_after)
Code example #17
 def create_model(env, algorithm, save_path):
     # the noise object
     n_actions = env.action_space.shape[-1]
     action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                 sigma=float(0.2) *
                                                 np.ones(n_actions),
                                                 theta=0.15)
     if algorithm == "ddpg":
         return DDPG(DDPG_MlpPolicy,
                     env,
                     learning_rate=0.001,
                     buffer_size=1000000,
                     batch_size=64,
                     tau=0.001,
                     gamma=0.99,
                     train_freq=(10, "step"),
                     action_noise=action_noise,
                     policy_kwargs=dict(optimizer_class=th.optim.AdamW),
                     tensorboard_log=save_path)
     elif algorithm == "td3":
         return TD3(TD3_MlpPolicy,
                    env,
                    action_noise=action_noise,
                    tensorboard_log=save_path)
     elif algorithm == "sac":
         return SAC(SAC_MlpPolicy,
                    env,
                    action_noise=action_noise,
                    tensorboard_log=save_path)
     else:
         raise Exception("--> Alican's LOG: Unknown agent type!")
Code example #18
def sac(env,
        hyper,
        policy="MlpPolicy",
        verbose=0,
        tensorboard_log=None,
        seed=0,
        use_sde=True,
        device="auto"):

    policy_kwargs = make_policy_kwargs(hyper, "sac")
    model = SAC(
        policy,
        env,
        verbose=verbose,
        tensorboard_log=tensorboard_log,
        seed=seed,
        use_sde=use_sde,
        learning_rate=hyper['params_lr'],
        gamma=hyper['params_gamma'],
        batch_size=int(hyper['params_batch_size']),
        buffer_size=int(hyper['params_buffer_size']),
        learning_starts=int(hyper['params_learning_starts']),
        train_freq=int(hyper['params_train_freq']),
        tau=hyper['params_tau'],
        gradient_steps=int(hyper['params_train_freq']),  # tuner assumes this
        policy_kwargs=policy_kwargs,
        device=device)
    return model
Code example #19
def get_perf(i):
    model = SAC("MlpPolicy",
                "Pendulum-v0",
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=5e3,
                verbose=1,
                create_eval_env=True,
                buffer_size=1000000,
                ent_coef=0.2,
                action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                seed=42)
    saved_policy = MlpPolicy.load(
        "Pendulum-v0#test4SAC#custom#None#{}.zip".format(i))
    mean_reward, std_reward = evaluate_policy(saved_policy,
                                              model.get_env(),
                                              n_eval_episodes=900)
    return mean_reward, std_reward
Code example #20
 def __init__(self, algorithm: str, checkpoint_path: str):
     if algorithm == 'ppo':
         policy = PPO.load(checkpoint_path)
     elif algorithm == 'sac':
         policy = SAC.load(checkpoint_path)
     else:
         raise NotImplementedError
     self._model = policy
Code example #21
def test_sac(ent_coef, i):
    model = SAC(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=3000,
        verbose=1,
        create_eval_env=True,
        buffer_size=10000,
        ent_coef=ent_coef,
        action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
        target_update_interval=5000,
        #tensorboard_log="./sac_pendulum_tensorboard/"
    )
    env = model.env
    eval_callback = EvalCallback(env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/without_target',
                                 eval_freq=250,
                                 deterministic=True,
                                 render=False)
    model.learn(total_timesteps=20000, eval_freq=250)
    """
    definition = 200
    portrait = np.zeros((definition, definition))                                                                       
    state_min = env.observation_space.low                                                                               
    state_max = env.observation_space.high
    for index_t, t in enumerate(np.linspace(-np.pi, np.pi , num=definition)):                               
        for index_td, td in enumerate(np.linspace(state_min[2], state_max[2], num=definition)):                               
            state = torch.Tensor([[np.cos(t), np.sin(t), td]])                                                                            
            action = model.policy.forward(state)
            portrait[definition - (1 + index_td), index_t] = model.critic.q1_forward(state, action)
    plt.figure(figsize=(10, 10))                                                                                        
    plt.imshow(portrait, cmap="inferno", extent=[-180, 180, state_min[2], state_max[2]], aspect='auto')
    plt.rc('axes', titlesize=12) 
    plt.xlabel('angle')
    plt.ylabel('velocity')
    plt.colorbar(label="critic value") 
    plt.scatter([0], [0])
    plt.show()
    #policy = model.policy
    #policy.save("Pendulum-v0#test4SAC#custom#None#{}.zip".format(i))
    #saved_policy = MlpPolicy.load("Pendulum-v0#test4SAC#custom#None#{}.zip".format(i))
    #mean_reward, std_reward = evaluate_policy(saved_policy, model.get_env(), n_eval_episodes=10)
    #print(mean_reward, std_reward)"""
    return model.replay_buffer.rewards
Code example #22
File: subcommand.py Project: jessecha/airc-rl-agent
def command_demo(args, config):
    agent, callback = _init_agent(args, config, train=False)
    model = SAC.load(args.model_path)
    obs = agent.reset()
    for step in range(args.time_steps):
        if step % 100 == 0: print("step: ", step)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = agent.step(action)
Code example #23
 def load_model(env, algorithm, filename):
     if algorithm == "ddpg":
         return DDPG.load(filename, env=env)
     elif algorithm == "td3":
         return TD3.load(filename, env=env)
     elif algorithm == "sac":
         return SAC.load(filename, env=env)
     else:
         raise Exception("--> Alican's LOG: Unknown agent type!")
Code example #24
def hindsight_experience_replay_example():
    # Hindsight Experience Replay (HER).

    import highway_env

    env = gym.make("parking-v0")

    # Create 4 artificial transitions per real transition.
    n_sampled_goal = 4

    # SAC hyperparams:
    model = SAC(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=n_sampled_goal,
            goal_selection_strategy="future",
            # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper
            # we have to manually specify the max number of steps per episode.
            max_episode_length=100,
            online_sampling=True,
        ),
        verbose=1,
        buffer_size=int(1e6),
        learning_rate=1e-3,
        gamma=0.95,
        batch_size=256,
        policy_kwargs=dict(net_arch=[256, 256, 256]),
    )

    model.learn(int(2e5))
    model.save("her_sac_highway")

    # Load saved model.
    # Because it needs access to 'env.compute_reward()'
    # HER must be loaded with the env.
    model = SAC.load("her_sac_highway", env=env)

    obs = env.reset()

    # Evaluate the agent.
    episode_reward = 0
    for _ in range(100):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        episode_reward += reward
        if done or info.get("is_success", False):
            print("Reward:", episode_reward, "Success?",
                  info.get("is_success", False))
            episode_reward = 0.0
            obs = env.reset()
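
As the comment in this example notes, max_episode_length is only required because the raw parking-v0 env carries no TimeLimit wrapper. A minimal alternative sketch, assuming a 100-step episode limit: wrapping the env with gym's TimeLimit records the limit on the env spec, so HerReplayBuffer can usually infer the episode length and max_episode_length can be omitted.

from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("parking-v0"), max_episode_steps=100)
model = SAC(
    "MultiInputPolicy",
    env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        n_sampled_goal=4,
        goal_selection_strategy="future",
        online_sampling=True,
        # max_episode_length is inferred from the TimeLimit wrapper
    ),
    verbose=1,
)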
Code example #25
def run(env, algname, filename):
    if algname == "TD3":
        model = TD3.load(f"{algname}_pkl")
    elif algname == "SAC":
        if filename:
            model = SAC.load(f"{filename}")
        else:
            model = SAC.load(f"{algname}_pkl")
    elif algname == "DDPG":
        model = DDPG.load(f"{algname}_pkl")
    else:
        raise "Wrong algorithm name provided."

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done:
            break
Code example #26
def _load_sac(agent, args, config, policy):
    model = None
    if args.load_model == '':
        model = SAC("MlpPolicy",
                    policy_kwargs=policy,
                    env=agent,
                    verbose=config.sac_verbose(),
                    batch_size=config.sac_batch_size(),
                    buffer_size=config.sac_buffer_size(),
                    learning_starts=config.sac_learning_starts(),
                    gradient_steps=config.sac_gradient_steps(),
                    train_freq=config.sac_train_freq(),
                    ent_coef=config.sac_ent_coef(),
                    learning_rate=config.sac_learning_rate(),
                    tensorboard_log="tblog",
                    gamma=config.sac_gamma(),
                    tau=config.sac_tau(),
                    use_sde_at_warmup=config.sac_use_sde_at_warmup(),
                    use_sde=config.sac_use_sde(),
                    sde_sample_freq=config.sac_sde_sample_freq(),
                    n_episodes_rollout=1)
    else:
        model = SAC.load(args.load_model,
                         env=agent,
                         policy_kwargs=policy,
                         verbose=config.sac_verbose(),
                         batch_size=config.sac_batch_size(),
                         buffer_size=config.sac_buffer_size(),
                         learning_starts=config.sac_learning_starts(),
                         gradient_steps=config.sac_gradient_steps(),
                         train_freq=config.sac_train_freq(),
                         ent_coef=config.sac_ent_coef(),
                         learning_rate=config.sac_learning_rate(),
                         tensorboard_log="tblog",
                         gamma=config.sac_gamma(),
                         tau=config.sac_tau(),
                         use_sde_at_warmup=config.sac_use_sde_at_warmup(),
                         use_sde=config.sac_use_sde(),
                         sde_sample_freq=config.sac_sde_sample_freq(),
                         n_episodes_rollout=1)
    return model
Code example #27
File: rl_race.py Project: helios57/rl-racing
def train_sac():
    latent_dim = 256
    vae = CVAE(latent_dim)
    vae.load_weights('./vae_256/checkpoint')
    env1 = DonkeyVAEEnv(vae, latent_dim, "Helios1")
    # manual_override=None if you don't want to "help" the Agend with w,a,s,d
    # env1 = DonkeyVAEEnv(vae, latent_dim, "Helios1", manual_override=ManualOverride())
    env1.client.collecting = False
    sac = SAC(env=env1,
              policy=MlpPolicy,
              buffer_size=20000,
              learning_starts=0,
              train_freq=20000,
              batch_size=256,
              verbose=2,
              gradient_steps=100,
              learning_rate=0.0005)
    # uncomment if you want to load a model and retrain it
    sac = sac.load("sac/model_sb3", env=env1)
    # sac = sac.load("sac/model_sb3_lake_36", env=env1)
    # sac = sac.load("sac/model_sb3_lake_36_unscaled", env=env1)
    env1.client.hardReset()
    env1.client.initCar()
    env1.client.reset()
    env1.client.restartScene()
    env1.client.hardReset()
    env1.client.initCar()
    env1.client.reset()
    env1.client.collecting = True
    env1.client.telemetrie = []
    while True:
        observation, index = env1.get_observation()
        action = sac.predict(np.asarray([observation]),
                             deterministic=False)[0][0]
        steering, throttle = action[0], action[1]
        env1.client.send_controls(steering * 0.4, throttle)
        # env1.client.send_controls(steering * 0.7, throttle * 0.8)
        print(
            str(index) + " steering:" + str(action[0]) + " throttle:" +
            str(action[1]) + " speed:" +
            str(env1.client.telemetrie[index].speed))
Code example #28
def main():
    as_gdads = True
    name = "pointmass"
    drop_abs_position = True

    dads_env_fn = envs_fns[name]
    conf: Conf = CONFS[name]

    dict_env = as_dict_env(dads_env_fn())
    dict_env = TimeLimit(dict_env, max_episode_steps=conf.ep_len)
    if drop_abs_position:
        dict_env = DropGoalEnvsAbsoluteLocation(dict_env)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env, skill_reset_steps=conf.ep_len // 2)
    else:
        flat_obs_content = ["observation", "desired_goal", "achieved_goal"]
        if drop_abs_position:
            flat_obs_content.remove("achieved_goal")  # Because always 0 vector
        flat_env = FlattenObservation(FilterObservation(dict_env, filter_keys=flat_obs_content))

    flat_env = TransformReward(flat_env, f=lambda r: r*conf.reward_scaling)
    flat_env = Monitor(flat_env)

    filename = f"modelsCommandSkills/{name}-gdads{as_gdads}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename, env=flat_env)
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=f"{filename}-tb", buffer_size=10000)
        train(model=sac, conf=conf, save_fname=filename)
        if as_gdads:
            flat_env.save(filename)

    if as_gdads:
        flat_env.set_sac(sac)
        eval_dict_env(dict_env=dict_env,
                      model=flat_env,
                      ep_len=conf.ep_len)
    show(model=sac, env=flat_env, conf=conf)
Code example #29
def test_sac(ent_coef, i):
    model = SAC(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=5e3,
        verbose=1,
        create_eval_env=True,
        buffer_size=1000000,
        ent_coef=ent_coef,
        action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
        # tensorboard_log="./sac_pendulum_tensorboard/"
    )
    eval_env = gym.make('Pendulum-v0')
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=250,
                                 deterministic=True,
                                 render=False)
    model.learn(total_timesteps=20000, callback=eval_callback)
Code example #30
def train_SAC(env, title="Stand Up Task Learning Curve"):
    print(f"action space shape -1:{env.action_space.shape[-1]}")

    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    callback = Logger(log_dir=log_dir)
    timesteps = 20000
    model = SAC('MlpPolicy',
                env,
                learning_rate=0.001,
                learning_starts=10000,
                ent_coef='auto_1.1',
                train_freq=1,
                n_episodes_rollout=-1,
                target_entropy=-21,
                buffer_size=1000000,
                action_noise=None,
                batch_size=64,
                verbose=1,
                policy_kwargs=dict(net_arch=[64, 64]))
    model.learn(total_timesteps=timesteps, callback=callback)

    model.save("SAC_pkl")
    plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, title)
    plt.savefig("{}/learn_curve.png".format(log_dir))
    plt.show()