def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy",
                        env_name,
                        policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200,
                                                     verbose=1)

    eval_callback = EvalCallback(eval_env,
                                 callback_on_new_best=callback_on_best,
                                 best_model_save_path=log_folder,
                                 log_path=log_folder,
                                 eval_freq=100)
    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1,
                                             save_path=log_folder,
                                             name_prefix="event")

    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # Stop training if max number of episodes is reached
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100,
                                                      verbose=1)

    callback = CallbackList([
        checkpoint_callback, eval_callback, event_callback,
        callback_max_episodes
    ])
    model.learn(500, callback=callback)

    # Check access to local variables
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])
    # Check that the child callback was called
    assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert event_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"]
    # Check that internal callback counters match the model's counters
    assert event_callback.num_timesteps == model.num_timesteps
    assert event_callback.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)

    # Testing models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a time limit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)

        model = model_class("MlpPolicy",
                            envs,
                            policy_kwargs=dict(net_arch=[32]))

        callback_max_episodes = StopTrainingOnMaxEpisodes(
            max_episodes=max_episodes, verbose=1)
        callback = CallbackList([callback_max_episodes])
        model.learn(1000, callback=callback)

        # Check that the actual number of episodes and timesteps per env matches the expected one
        episodes_per_env = callback_max_episodes.n_episodes // n_envs
        assert episodes_per_env == max_episodes
        timesteps_per_env = model.num_timesteps // n_envs
        assert timesteps_per_env == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
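
# --- Illustrative sketch, not part of the original test ---
# All callbacks used above (CheckpointCallback, EvalCallback, EveryNTimesteps, ...)
# derive from stable_baselines3's BaseCallback. A minimal custom callback, assuming
# the standard BaseCallback API (n_calls, num_timesteps, _on_step), could look like:
from stable_baselines3.common.callbacks import BaseCallback


class PrintTimestepsCallback(BaseCallback):
    """Toy callback that prints the timestep counter every `print_freq` calls."""

    def __init__(self, print_freq: int = 1000, verbose: int = 0):
        super().__init__(verbose)
        self.print_freq = print_freq

    def _on_step(self) -> bool:
        # Returning False here would stop training early,
        # which is how StopTrainingOnRewardThreshold works.
        if self.n_calls % self.print_freq == 0:
            print(f"{self.num_timesteps} timesteps elapsed")
        return True
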
Example #2
    tasks.check_name(name)
    save_parameter(save_model_path, args)

    env_builder = importlib.import_module('{}.env_builder'.format(name))
    env = env_builder.build_env(enable_randomizer=True,
                                version=version,
                                enable_rendering=False,
                                control_mode=control_mode)
    eval_env = env_builder.build_env(enable_randomizer=True,
                                     version=version,
                                     enable_rendering=False,
                                     control_mode=control_mode)

    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=save_model_path,
                                 log_path=save_model_path,
                                 eval_freq=1000,
                                 deterministic=True,
                                 render=False)
    policy_kwargs = dict(activation_fn=torch.nn.ReLU, net_arch=net_arch)
    model = SAC('MlpPolicy',
                env,
                verbose=1,
                tensorboard_log=save_model_path,
                policy_kwargs=policy_kwargs,
                buffer_size=buffer_size,
                batch_size=batch_size,
                learning_starts=learning_starts,
                ent_coef=ent_coef)
    if args.load_from_best:
        model = SAC.load(load_model_path)
        model.set_env(env)
# FOR REFERENCE
# policies[0] = fixed_rock_policy,
# policies[1] = fixed_paper_policy,
# policies[2] = fixed_scissors_policy,
# policies[3] = copycat_policy,
# policies[4] = random_policy,
# policies[5] = aggressive_policy,
# policies[6] = passive_policy,

opp_policies = policies
meta_env = gym.make('rps-meta-v0')
test_meta_env = gym.make('rps-meta-v0')


eval_callback = EvalCallback(test_meta_env, eval_freq=1000, deterministic=True, render=False)


policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[30,30])
n_steps, batch_size, n_epochs =  50, 50, 10

model = PPO("MlpPolicy", meta_env, policy_kwargs=policy_kwargs, n_steps=n_steps, batch_size=batch_size, n_epochs=n_epochs, verbose=0)
#model.learn(total_timesteps=100000, callback=eval_callback, meta_learn=False)        # no meta learning
model.learn(total_timesteps=100000, callback=eval_callback, meta_learn=True)     # meta learning

opponent_policies = [
    np.array([0,1,2,0,1]),
    np.array([1,2,2,1,0]),
    np.array([2,1,0,0,0]),
    #np.array([2,2,1,1,0]),
    #np.array([0,1,2,2,2]),
]

def test_sac_phase():
    reward = []
    for i in [2000, 4000, 6000, 8000, 10000]:
        model = SAC("MlpPolicy",
                    "Pendulum-v0",
                    policy_kwargs=dict(net_arch=[64, 64]),
                    learning_starts=5000,
                    verbose=0,
                    create_eval_env=True,
                    buffer_size=i,
                    ent_coef=0,
                    action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                    batch_size=32)
        env = model.env
        eval_callback = EvalCallback(env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/alpha5_phase',
                                     eval_freq=250,
                                     n_eval_episodes=100,
                                     deterministic=True,
                                     render=False)
        model.learn(total_timesteps=20000, callback=eval_callback)
        reward.append(eval_callback.last_mean_reward)
        definition = 200
        portrait = np.zeros((definition, definition))
        state_min = env.observation_space.low
        state_max = env.observation_space.high
        for index_t, t in enumerate(np.linspace(-np.pi, np.pi,
                                                num=definition)):
            for index_td, td in enumerate(
                    np.linspace(state_min[2], state_max[2], num=definition)):
                state = torch.Tensor([[np.cos(t), np.sin(t), td]])
                action = model.policy.forward(state)
                portrait[definition - (1 + index_td),
                         index_t] = model.critic.q1_forward(state, action)
        plt.figure(figsize=(10, 10))
        plt.imshow(portrait,
                   cmap="inferno",
                   extent=[-180, 180, state_min[2], state_max[2]],
                   aspect='auto')
        plt.rc('axes', titlesize=12)
        plt.xlabel('angle')
        plt.ylabel('velocity')
        plt.title(
            "critic, last mean reward = {:.2f} +/- {:.2f}, replay size = {}".
            format(reward[-1], eval_callback.last_std, i))
        plt.colorbar(label="critic value")
        plt.scatter([0], [0])
        plt.show()
        # Second portrait: the actor's action over the same state grid
        definition = 200
        portrait = np.zeros((definition, definition))
        state_min = env.observation_space.low
        state_max = env.observation_space.high
        for index_t, t in enumerate(np.linspace(-np.pi, np.pi,
                                                num=definition)):
            for index_td, td in enumerate(
                    np.linspace(state_min[2], state_max[2], num=definition)):
                state = torch.Tensor([[np.cos(t), np.sin(t), td]])
                probs = model.policy.forward(state)
                action = probs.data.numpy().astype(float)
                portrait[definition - (1 + index_td), index_t] = action
        plt.figure(figsize=(10, 10))
        plt.imshow(portrait,
                   cmap="coolwarm",
                   extent=[-180, 180, state_min[2], state_max[2]],
                   aspect='auto')
        plt.title(
            "action, last mean reward = {:.2f} +/- {:.2f}, replay size = {}".
            format(reward[-1], eval_callback.last_std, i))
        plt.colorbar(label="action")
        plt.rc('axes', titlesize=12)
        plt.xlabel('angle')
        plt.ylabel('velocity')
        plt.scatter([0], [0])
        plt.show()

    return reward
def train_alg(model_alg,
              reset_optimizers,
              buffer_size,
              subsave,
              iteration,
              last_round_no_mer,
              is_evolving,
              gradient_steps=GRADIENT_STEPS,
              params_list=params_list):
    training_timesteps = META_TRAINING_TIMESTEPS
    params = params_list
    if not is_evolving:
        params = [params[-1]]

    start_time = time()
    env = gym.make(env_name)
    eval_env = gym.make(env_name)
    final_eval_env = gym.make(env_name)
    final_parameters_dict = params[-1]
    change_env_parameters(final_eval_env, parameter_dict=final_parameters_dict)
    tensorboard_path = subsave + '/tb_' + str(iteration)

    optimizer_kwargs = {}
    policy_kwargs = {
        'optimizer_class': th.optim.Adam,
        'optimizer_kwargs': optimizer_kwargs,
    }
    model = model_alg(MlpPolicy,
                      env,
                      verbose=1,
                      buffer_size=buffer_size,
                      batch_size=BATCH_SIZE,
                      learning_rate=LEARNING_RATE,
                      learning_starts=LEARNING_STARTS,
                      gradient_steps=gradient_steps,
                      policy_kwargs=policy_kwargs,
                      mer_s=MER_S,
                      mer_gamma=MER_GAMMA,
                      monitor_wrapper=True,
                      tensorboard_log=tensorboard_path)

    # Add memories from all buffers to current one, filling it up entirely in the process
    for replay_buffer in replay_buffers_list:
        model.add_memories_from_another_replay_mem(replay_buffer)

    for i_param, param in enumerate(params):
        log_name = 'run_' + str(i_param)
        if i_param == (len(params) - 1):
            training_timesteps = FINAL_TRAINING_TIMESTEPS
            log_name += '_final'
        change_env_parameters(env, eval_env, parameter_dict=param)
        if model_alg.__name__ == 'SACMER' and last_round_no_mer and (
                i_param == (len(params) - 1)):
            is_reservoir = False
            is_mer = False
        else:  # This will not have any effect on regular SAC
            is_reservoir = True
            is_mer = True
        model.update_env(
            env,
            monitor_wrapper=False,
            is_reservoir=is_reservoir,
            reset_optimizers=reset_optimizers
        )  # environment already wrapped so monitor_wrapper=False
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=None,
                                     log_path=tensorboard_path + '/' +
                                     log_name + '/running_eval',
                                     eval_freq=EVAL_FREQ,
                                     n_eval_episodes=N_EVAL_EPISODES,
                                     deterministic=True,
                                     render=False)
        if is_evolving:
            final_eval_callback = EvalCallback(final_eval_env,
                                               best_model_save_path=None,
                                               log_path=tensorboard_path +
                                               '/' + log_name + '/final_eval',
                                               eval_freq=EVAL_FREQ,
                                               n_eval_episodes=N_EVAL_EPISODES,
                                               deterministic=True,
                                               render=False)
        else:
            final_eval_callback = EventCallback()  # empty callback
        model.learn(total_timesteps=training_timesteps,
                    log_interval=1,
                    reset_num_timesteps=False,
                    tb_log_name=log_name,
                    is_mer=is_mer,
                    callback=CallbackList([eval_callback,
                                           final_eval_callback]))
        env.reset()
        eval_env.reset()
    # if iteration == 0:  # saving models fills up storage, so we only save one (which we will also probably not use)
    model.save(subsave + 'model_' + str(iteration), include=['replay_buffer'])
    print(f"Done. Total time = {time() - start_time} seconds.")
Example #6
from torch import nn

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback

env = gym.make('HumanoidBasicEnv-v0')
eval_env = gym.make('HumanoidBasicEnv-v0')

policy_kwargs = dict(activation_fn=nn.ReLU, net_arch=[1024, 512])

# model = PPO.load('walking_agent', env=env)
model = PPO('MlpPolicy',
            env,
            policy_kwargs=policy_kwargs,
            verbose=0,
            tensorboard_log='./walk/logs/')

# Save the best model periodically during training
bestModelCallback = EvalCallback(eval_env=eval_env,
                                 eval_freq=10000,
                                 log_path='./walk/logs/',
                                 best_model_save_path='./walk/logs/')

model.learn(total_timesteps=200,
            eval_freq=4000,
            eval_env=eval_env,
            tb_log_name='static_run',
            callback=bestModelCallback)
model.save('static_agent')

env.close()
Example #7
def evaluate(individual: Individual,
             device: Union[torch.device, str] = "auto") -> Tuple[int]:
    """
    Evaluate a single individual model and return it's mean score after the training time is elapsed.
    Models are trained and evaluated for a number of timestamps as parameterized in the constants at the
    top of the file.
    :param individual: The individual to evaluate.
    :return:
    """

    t_start = time()
    layers = individual.weights
    name = individual.encode()

    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)

    if os.path.exists(checkpoint_path):
        return (random.randint(MIN_SCORE, MAX_SCORE), )

    os.makedirs(checkpoint_path, exist_ok=True)
    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)

    results_path = os.path.join(checkpoint_path, "results.json")

    if not os.path.exists(results_path):
        env_args = dict(
            frame_skip=4,
            screen_size=84,
            terminal_on_life_loss=True,
            clip_reward=True,
        )

        # Creates a gym environment for an atari game using the specified seed and number of environments
        # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
        # for improved performance.
        def atari_wrapper(env: gym.Env) -> gym.Env:
            env = AtariWrapper(env, **env_args)
            return env

        def make_env(rank: int, count: int) -> VecEnv:
            return make_vec_env(
                ENV_NAME,
                n_envs=count,
                seed=RANDOM_SEED + rank,
                start_index=0,
                monitor_dir=None,
                wrapper_class=atari_wrapper,
                env_kwargs=None,
                vec_env_cls=SubprocVecEnv,
                vec_env_kwargs=None,
                monitor_kwargs=None,
            )

        train_env = make_env(0, N_ENVS)
        eval_env = make_env(1, 1)

        # required by models in baselines
        train_env = VecTransposeImage(train_env)
        eval_env = VecTransposeImage(eval_env)

        # setup callback to save model at fixed intervals
        save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ,
                                           save_path=checkpoint_path,
                                           name_prefix=name)
        stop_callback = StopTrainingOnRewardThreshold(
            reward_threshold=EVAL_THRESHOLD)
        time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
        best_callback = EvalCallback(
            eval_env,
            eval_freq=EVAL_FREQ,
            best_model_save_path=checkpoint_path,
            callback_on_new_best=stop_callback,
        )
        list_callback = CallbackList(
            [save_callback, best_callback, time_callback])

        model = PPO(
            CnnPolicy,
            train_env,
            verbose=VERBOSE,
            batch_size=BATCH_SIZE,
            seed=RANDOM_SEED * 7,
            tensorboard_log=log_path,
            learning_rate=LEARNING_RATE,
            n_steps=UPDATE_STEPS,
            n_epochs=N_EPOCHS,
            ent_coef=ENT_COEF,
            vf_coef=VF_COEF,
            clip_range=CLIP_RANGE,
            device=device,
            policy_kwargs=dict(features_extractor_class=VariableBenchmark,
                               features_extractor_kwargs=dict(layers=layers)),
        )

        config_path = os.path.join(checkpoint_path, "cnn_config")
        zip_path = os.path.join(checkpoint_path, "model.zip")

        # output the model config to a file for easier viewing
        with open(config_path, "w") as file:
            file.write(f"{name}\n")
            file.write(str(model.policy.features_extractor.cnn))

        print("Beginning training...")

        model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
        model.save(zip_path)

        del train_env
        del eval_env

        time_taken = time() - t_start

        print("Beginning evaluation...")

        # score of the game, standard deviation of multiple runs
        reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

        with open(results_path, "w") as handle:
            handle.write(json.dumps((reward_mean, reward_std, time_taken)))
    else:
        with open(results_path, "r") as handle:
            reward_mean, reward_std, time_taken = json.load(handle)

    reward_mean = abs(MIN_SCORE) + reward_mean
    value = (reward_mean * weighted_time(time_taken), )

    print(f"Evaluated {name} with a score of {value}  in {(time_taken):.2f}s")

    return value
Example #8

    env = create_env(n_envs)

    # Create test env if needed, do not normalize reward
    eval_env = None
    if args.eval_freq > 0:
        # Account for the number of parallel environments
        args.eval_freq = max(args.eval_freq // n_envs, 1)

        if 'NeckEnv' in env_id:
            # Use the training env as eval env when using the neck
            # because there is only one robot
            # there will be an issue with the reset
            eval_callback = EvalCallback(env, callback_on_new_best=None,
                                         best_model_save_path=save_path,
                                         log_path=save_path, eval_freq=args.eval_freq)
            callbacks.append(eval_callback)
        else:
            # Do not normalize the rewards of the eval env
            old_kwargs = None
            if normalize:
                if len(normalize_kwargs) > 0:
                    old_kwargs = normalize_kwargs.copy()
                    normalize_kwargs['norm_reward'] = False
                else:
                    normalize_kwargs = {'norm_reward': False}

            if args.verbose > 0:
                print("Creating test environment")
# hyperparams
buffer_size = max(total_timesteps // 100, 500)
learning_starts = max(total_timesteps // 1000, 100)
train_freq = 1
target_update_interval = 100
exploration_fraction = (learning_starts + 1000) / total_timesteps

# evaluation parameters
eval_env = gym.make(env_name)
eval_env = Monitor(eval_env)
eval_freq = max(1000, total_timesteps // 20)
eval_log_path = "eval_logs/dqnclippedreg_{}_{}_{}_{}".format(
    env_name, loss_type, seed, time_int)
eval_callback = EvalCallback(eval_env,
                             log_path=eval_log_path,
                             eval_freq=eval_freq,
                             deterministic=True,
                             render=False,
                             n_eval_episodes=25)

if env_name == 'MountainCar-v0':
    buffer_size = 10000  # max(total_timesteps // 100, 500)
    learning_starts = 1000  # max(total_timesteps // 1000, 100)
    learning_rate = 4e-3
    batch_size = 128
    gamma = 0.98
    train_freq = 16
    target_update_interval = 600
    gradient_steps = 8
    exploration_fraction = 0.2  # (learning_starts + 1000)/total_timesteps
    exploration_final_eps = 0.07
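
# --- Illustrative sketch, not in the original (the snippet is truncated before the
# model is constructed). Assuming a standard stable_baselines3 DQN -- the original
# appears to use a custom loss variant, given "dqnclippedreg" in the log path --
# the hyperparameters above would typically be wired up roughly as follows.
# `env`, `seed` and `total_timesteps` come from the elided part of the script.
from stable_baselines3 import DQN

model = DQN("MlpPolicy",
            env,
            buffer_size=buffer_size,
            learning_starts=learning_starts,
            train_freq=train_freq,
            target_update_interval=target_update_interval,
            exploration_fraction=exploration_fraction,
            seed=seed,
            verbose=0)
model.learn(total_timesteps=total_timesteps, callback=eval_callback)
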
Example #10
# by frank tian, 2021-1-16

from stable_baselines3 import DQN
import gym_flappy_bird
from stable_baselines3.common.callbacks import EvalCallback
import gym

env = gym.make("FlappyBirdFeature-v1")
eval_env = gym.make("FlappyBirdFeature-v1")

eval_callback = EvalCallback(eval_env=eval_env,
                             eval_freq=5000,
                             log_path="logs",
                             best_model_save_path="logs")

model = DQN(policy="MlpPolicy",
            env=env,
            batch_size=32,
            buffer_size=1000000,
            learning_starts=50000,
            tensorboard_log="log")

print(model.policy)

if __name__ == "__main__":
    model.learn(int(1e7), callback=eval_callback)
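
    # --- Illustrative follow-up, not part of the original example ---
    # EvalCallback(best_model_save_path="logs") writes logs/best_model.zip whenever a
    # new best mean reward is found; assuming training produced that file, it can be
    # reloaded and re-scored like this:
    from stable_baselines3.common.evaluation import evaluate_policy

    best_model = DQN.load("logs/best_model", env=eval_env)
    mean_reward, std_reward = evaluate_policy(best_model, eval_env, n_eval_episodes=10)
    print(f"Best model mean reward: {mean_reward:.1f} +/- {std_reward:.1f}")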
Example #11
    target_update_interval=10000,
    learning_starts=200000,
    buffer_size=500000,
    max_grad_norm=10,
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    device="cuda",
    tensorboard_log="./tb_logs/",
)

# Create an evaluation callback with the same env, called every 10000 iterations
callbacks = []
eval_callback = EvalCallback(
    env,
    callback_on_new_best=None,
    n_eval_episodes=5,
    best_model_save_path=".",
    log_path=".",
    eval_freq=10000,
)
callbacks.append(eval_callback)

kwargs = {}
kwargs["callback"] = callbacks

# Train for a certain number of timesteps
model.learn(total_timesteps=5e5,
            tb_log_name="dqn_airsim_car_run_" + str(time.time()),
            **kwargs)

# Save policy weights
model.save("dqn_airsim_car_policy")
Example #12
                print(f"Saving VecNormalize to {self.save_path}")
        return True


checkpoint_callback = CheckpointCallback(save_freq=30000,
                                         save_path=logger.output_dir,
                                         name_prefix='rl_model')

savestats_callback = SaveNormalization(save_path=osp.join(
    logger.output_dir,
    "vec_normalization.pkl"))  # If using normalize, must create this callback

eval_callback = EvalCallback(eval_env=eval_env,
                             n_eval_episodes=5,
                             callback_on_new_best=savestats_callback,
                             eval_freq=1000,
                             best_model_save_path=osp.join(
                                 logger.output_dir, "best_model"),
                             log_path=osp.join(logger.output_dir, "results"))

callback = CallbackList([checkpoint_callback, eval_callback])

if custom_params['algo'] == 'sac':
    model = SAC(policy=custom_params['policy'],
                env=env,
                verbose=1,
                **custom_params['sac_parameters'],
                tensorboard_log=logger.output_dir)
elif custom_params['algo'] == 'dqn':
    model = DQN(policy=custom_params['policy'],
                env=env,
Example #13
            tensorboard_log='results/tb/',
            verbose=1)

# In[12]:

eval_env = gym.make("hover-aviary-v0",
                    aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS,
                    obs=ObservationType.KIN,
                    act=ActionType.RPM)

# In[13]:

EPISODE_REWARD_THRESHOLD = -0
callback_on_best = StopTrainingOnRewardThreshold(
    reward_threshold=EPISODE_REWARD_THRESHOLD, verbose=1)

eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=callback_on_best,
                             verbose=1,
                             best_model_save_path='results/',
                             log_path='results/',
                             eval_freq=int(2000 / os.cpu_count()),
                             deterministic=True,
                             render=False)

# In[ ]:

model.learn(total_timesteps=int(50e6),
            callback=eval_callback,
            log_interval=100)
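
# In[ ]:

# --- Illustrative follow-up, not in the original notebook ---
# EvalCallback(log_path='results/') periodically appends to results/evaluations.npz;
# assuming at least one evaluation has run, the learning curve can be inspected with:
import numpy as np

with np.load('results/evaluations.npz') as data:
    timesteps = data['timesteps']   # shape (n_evaluations,)
    results = data['results']       # shape (n_evaluations, n_eval_episodes)
    print("mean eval reward per checkpoint:", results.mean(axis=1))
    print("at timesteps:", timesteps)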
Example #14
# FOR REFERENCE
# policies[0] = fixed_rock_policy,
# policies[1] = fixed_paper_policy,
# policies[2] = fixed_scissors_policy,
# policies[3] = copycat_policy,
# policies[4] = random_policy,
# policies[5] = aggressive_policy,
# policies[6] = passive_policy,

opp_policies = policies
multitask_env = gym.make('rps-multitask-v0', opp_policies=opp_policies)
test_multitask_env = gym.make('rps-multitask-v0', opp_policies=opp_policies)


eval_callback = EvalCallback(test_multitask_env, eval_freq=1000, deterministic=True, render=False)


policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[8,8])
n_steps, batch_size, n_epochs =  10, 10, 10

model = PPO("MlpPolicy", multitask_env, policy_kwargs=policy_kwargs, n_steps=n_steps, batch_size=batch_size, n_epochs=n_epochs, verbose=0)
model.learn(total_timesteps=20000, callback=eval_callback)



rewards_fixed_rock = multitask_env.run_sim(policies[0], 50, model, 0)
rewards_fixed_paper = multitask_env.run_sim(policies[1], 50, model, 1)
rewards_fixed_scissors = multitask_env.run_sim(policies[2], 50, model, 2)
rewards_copycat = multitask_env.run_sim(policies[3], 50, model, 3)
rewards_random = multitask_env.run_sim(policies[4], 50, model, 4)
Example #15
            env,
            verbose=1,
            buffer_size=100_000,
            batch_size=256,
            learning_rate=0.0003,
            learning_starts=1024,
            gamma=0.95,
            ent_coef='auto',
            policy_kwargs=policy_kwargs,
            train_freq=512,
            gradient_steps=-1,
            device="cpu")

eval_callback = EvalCallback(eval_env,
                             best_model_save_path=best_save_path,
                             log_path=log_dir,
                             eval_freq=1024,
                             deterministic=True,
                             render=False)
timesteps = 5_000_000

model.learn(timesteps, callback=eval_callback)

model.save(save_path)

# Evaluate
env.close()
env = BaselinifyWrapper(
    TimeLimit(gym.make("PepperReachCam-v0", gui=True, dense=True),
              max_episode_steps=100))
model = SAC.load(save_path)
obs = env.reset()
Example #16
        return env

    env = create_env(n_envs)

    # Create test env if needed, do not normalize reward
    eval_env = None
    if args.eval_freq > 0 and not args.optimize_hyperparameters:
        # Account for the number of parallel environments
        args.eval_freq = max(args.eval_freq // n_envs, 1)

        if 'NeckEnv' in env_id:
            # Use the training env as eval env when using the neck
            # because there is only one robot
            # there will be an issue with the reset
            eval_callback = EvalCallback(env, callback_on_new_best=None,
                                         best_model_save_path=save_path,
                                         log_path=save_path, eval_freq=args.eval_freq)
            callbacks.append(eval_callback)
        else:
            if args.verbose > 0:
                print("Creating test environment")

            # save_vec_normalize = SaveVecNormalizeCallback(save_freq=1, save_path=params_path)
            # eval_callback = EvalCallback(create_env(1, eval_env=True), callback_on_new_best=save_vec_normalize,
            #                              best_model_save_path=save_path, n_eval_episodes=args.eval_episodes,
            #                              log_path=save_path, eval_freq=args.eval_freq,
            #                              deterministic=not is_atari)
            # save_vec_normalize = SaveVecNormalizeCallback(save_freq=1, save_path=params_path)
            eval_callback = EvalCallback(env,
                                         best_model_save_path=save_path, n_eval_episodes=args.eval_episodes,
                                         log_path=save_path, eval_freq=args.eval_freq,
Example #17
                                          rew_threshold=13.5,
                                          task_mode=params['task_mode'],
                                          verbose=1)
    eval_env = Monitor(FlatlandEnv(task_manager,
                                   PATHS.get('robot_setting'),
                                   PATHS.get('robot_as'),
                                   params['reward_fnc'],
                                   params['discrete_action_space'],
                                   goal_radius=1.00,
                                   max_steps_per_episode=350),
                       PATHS.get('eval'),
                       info_keywords=("done_reason", ))
    eval_cb = EvalCallback(eval_env,
                           n_eval_episodes=20,
                           eval_freq=10000,
                           log_path=PATHS.get('eval'),
                           best_model_save_path=PATHS.get('model'),
                           deterministic=True,
                           callback_on_new_best=trainstage_cb)

    # determine mode
    if args.custom_mlp:
        # custom mlp flag
        model = PPO("MlpPolicy",
                    env,
                    policy_kwargs=dict(net_arch=args.net_arch,
                                       activation_fn=get_act_fn(args.act_fn)),
                    gamma=gamma,
                    n_steps=n_steps,
                    ent_coef=ent_coef,
                    learning_rate=learning_rate,
                                    seed=0)
        if env_name == "tune-aviary-v0":
            eval_env = make_vec_env(TuneAviary,
                                    env_kwargs=sa_env_kwargs,
                                    n_envs=1,
                                    seed=0)
        eval_env = VecTransposeImage(eval_env)

    #### Train the model #######################################
    # checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=filename+'-logs/', name_prefix='rl_model')
    callback_on_best = StopTrainingOnRewardThreshold(
        reward_threshold=EPISODE_REWARD_THRESHOLD, verbose=1)
    eval_callback = EvalCallback(eval_env,
                                 callback_on_new_best=callback_on_best,
                                 verbose=1,
                                 best_model_save_path=filename + '/',
                                 log_path=filename + '/',
                                 eval_freq=int(2000 / ARGS.cpu),
                                 deterministic=True,
                                 render=False)
    model.learn(
        total_timesteps=35000,  #int(1e12),
        callback=eval_callback,
        log_interval=100,
    )

    #### Save the model ########################################
    model.save(filename + '/success_model.zip')
    print(filename)

    #### Print training progression ############################
    with np.load(filename + '/evaluations.npz') as data:
def train_alg(model_alg, reset_optimizers_between_envs, reset_optimizers_every_iter, buffer_size, subsave, iteration,
              last_round_no_mer, is_evolving, seed):
    seed_all(seed)
    training_timesteps = META_TRAINING_TIMESTEPS
    params = params_list
    if not is_evolving:
        params = [params[-1]]

    start_time = time()
    env = gym.make(env_name)
    eval_env = gym.make(env_name)
    final_eval_env = gym.make(env_name)
    final_parameters_dict = params_sampler.sample1_means()
    change_env_parameters(final_eval_env, parameter_dict=final_parameters_dict)
    tensorboard_path = subsave + '/tb_' + str(iteration)

    optimizer_kwargs = {}
    policy_kwargs = {
        'optimizer_class': th.optim.Adam,
        'optimizer_kwargs': optimizer_kwargs,
    }
    model = model_alg(MlpPolicy, env, verbose=0, buffer_size=buffer_size, batch_size=BATCH_SIZE,
                      learning_rate=LEARNING_RATE,
                      learning_starts=LEARNING_STARTS,
                      gradient_steps=GRADIENT_STEPS, policy_kwargs=policy_kwargs, mer_s=MER_S, mer_gamma=MER_GAMMA,
                      monitor_wrapper=True,
                      tensorboard_log=tensorboard_path,
                      reset_optimizers_during_training=reset_optimizers_every_iter,
                      seed=seed
                      )

    for i_param, param in enumerate(params):
        log_name = 'run_' + str(i_param)
        if i_param == (len(params) - 1):
            if not is_evolving:
                training_timesteps = FINAL_TRAINING_TIMESTEPS + NUM_TRAINING_ENVS * META_TRAINING_TIMESTEPS
            else:
                training_timesteps = FINAL_TRAINING_TIMESTEPS
            log_name += '_final'
        change_env_parameters(env, eval_env, parameter_dict=param)
        if model_alg.__name__ == 'DQNMER' and last_round_no_mer and (i_param == (len(params) - 1)):
            is_reservoir = False
            is_mer = False
        else:  # This will not have any effect on regular DQN
            is_reservoir = True
            is_mer = True
        model.update_env(env, monitor_wrapper=False, is_reservoir=is_reservoir,
                         reset_optimizers=reset_optimizers_between_envs)  # environment already wrapped so
        # monitor_wrapper=False
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=None,
                                     log_path=tensorboard_path + '/' + log_name + '/running_eval',
                                     eval_freq=EVAL_FREQ,
                                     n_eval_episodes=N_EVAL_EPISODES,
                                     deterministic=True, render=False)
        if is_evolving:
            final_eval_callback = EvalCallback(final_eval_env,
                                               best_model_save_path=None,
                                               log_path=tensorboard_path + '/' + log_name + '/final_eval',
                                               eval_freq=EVAL_FREQ,
                                               n_eval_episodes=N_EVAL_EPISODES,
                                               deterministic=True, render=False)
        else:
            final_eval_callback = EventCallback()
        model.learn(total_timesteps=training_timesteps, log_interval=1, reset_num_timesteps=False,
                    tb_log_name=log_name, is_mer=is_mer, callback=CallbackList([eval_callback, final_eval_callback]))
        env.reset()
        eval_env.reset()
    if iteration == 0:  # saving models fills up storage, so we only save one (which we will also probably not use)
        model.save(subsave + 'model_' + str(iteration))
    print(f"Done. Total time = {time() - start_time} seconds.")
Example #20
    PATHS = get_paths(AGENT_NAME, args)

    if args.n is None:
        n_timesteps = 6000
    else:
        n_timesteps = args.n

    # instantiate gym environment
    n_envs = 1

    task = get_predefined_task("random")
    env = DummyVecEnv([lambda: FlatlandEnv(task, PATHS.get('robot_setting'), PATHS.get('robot_as'), discrete_action_space)] * n_envs)
   
    # instantiate eval environment
    eval_env = Monitor(FlatlandEnv(task, PATHS.get('robot_setting'), PATHS.get('robot_as'), discrete_action_space), PATHS.get('eval'))
    eval_cb = EvalCallback(eval_env, n_eval_episodes=10, eval_freq=250, log_path=PATHS.get('eval'), best_model_save_path=PATHS.get('model'), deterministic=True)


    # determine mode
    if args.custom_mlp:
        # custom mlp flag
        model = PPO("MlpPolicy", env, policy_kwargs = dict(net_arch = args.net_arch, activation_fn = get_act_fn(args.act_fn)), 
                    gamma = gamma, n_steps = n_steps, ent_coef = ent_coef, learning_rate = learning_rate, vf_coef = vf_coef, 
                    max_grad_norm = max_grad_norm, gae_lambda = gae_lambda, batch_size = batch_size, n_epochs = n_epochs, clip_range = clip_range, 
                    tensorboard_log = PATHS.get('tb'), verbose = 1)


    elif args.agent is not None:
        # predefined agent flag
        if args.agent == "MLP_ARENA2D":
Example #21
def main():
    if(StartFresh):
        # Create Environment
        env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        # Separate evaluation env
        eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # Create Model
        model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log)

    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')
        # # Load Enironment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()
        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()
        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if(DoTraining):
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)
        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env, best_model_save_path=best_path,
                                    log_path=best_path, eval_freq=eval_freq,
                                    deterministic=True, render=False)
        # Video Update Callback 
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)
        
        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
            tb_log_name=tb_log_name, 
            reset_num_timesteps=False,
            callback=callbacks)

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if(DoVideo):
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
def main():
    if(StartFresh):
        # Create Environment
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        # Separate evaluation env
        eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # Create Model
        # model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log, device="auto")
        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[dict(pi=[256, 256], vf=[256, 256])])

        model = PPO('MlpPolicy', 
            env, 
            learning_rate = 3e-5,
            n_steps=512,
            batch_size=128,
            n_epochs=20,
            gamma=0.99,
            gae_lambda = 0.9,
            clip_range = 0.4,
            vf_coef = 0.5,
            use_sde = True,
            sde_sample_freq = 4,
            policy_kwargs = policy_kwargs, 
            verbose=1, 
            tensorboard_log=tb_log,
            device="auto")


    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')
        # # Load Enironment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()
        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()
        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if(DoTraining):
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)
        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env, best_model_save_path=best_path,
                                    log_path=best_path, eval_freq=eval_freq,
                                    deterministic=True, render=False)
        # Video Update Callback 
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        # nStep_callback_list = CallbackList([envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)
        
        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])
        # callbacks = CallbackList([checkpoint_callback, eval_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
            tb_log_name=tb_log_name, 
            reset_num_timesteps=False,
            callback=callbacks)

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if(DoVideo):
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
Example #23
        default=-1,
        type=int,
    )
    args = parser.parse_args()

    env_id = args.env
    n_timesteps = args.n_timesteps
    save_path = f"{args.algo}_{env_id}"

    # Instantiate and wrap the environment
    env = gym.make(env_id)

    # Create the evaluation environment and callbacks
    eval_env = Monitor(gym.make(env_id))

    callbacks = [EvalCallback(eval_env, best_model_save_path=save_path)]

    # Save a checkpoint every n steps
    if args.save_freq > 0:
        callbacks.append(
            CheckpointCallback(save_freq=args.save_freq,
                               save_path=save_path,
                               name_prefix="rl_model"))

    algo = {
        "sac": SAC,
        "td3": TD3,
    }[args.algo]

    n_actions = env.action_space.shape[0]
Example #24
import gym
import pybullet_envs  # noqa: F401  # assumed source of Walker2DBulletEnv-v0; importing registers it with gym

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback

BUFFER_SIZE = int(1e6)
LEARNING_STARTS = int(1e4)
BATCH_SIZE = 64
ENT_COEF = 0.05

ENV_NAME = 'Walker2DBulletEnv-v0'
TIME_STEPS = 100000

env = gym.make(ENV_NAME)
eval_env = gym.make(ENV_NAME)

eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./logs/',
                             log_path='./logs/',
                             eval_freq=500,
                             deterministic=True,
                             render=False)
model = SAC('MlpPolicy',
            env,
            verbose=1,
            tensorboard_log="./log/",
            buffer_size=BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            learning_starts=LEARNING_STARTS,
            ent_coef=ENT_COEF)

model.learn(total_timesteps=TIME_STEPS, callback=eval_callback)

env.render()
obs = env.reset()
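
# --- Illustrative continuation, not in the original (the snippet stops right after reset) ---
# A typical enjoy loop with the trained policy might look like this:
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()
env.close()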
Example #25
def main(env, args):
    global model
    # Fix random seeds and number of threads
    np.random.seed(args.seed)

    if args.recodex:
        models = []
        for path in args.load_from:
            models.append(SAC.load(path))

        while True:
            state, done = env.reset(start_evaluation=True), False
            ret = 0
            while not done:
                action = np.sum(np.array(
                    list(
                        map(lambda m: m.predict(state, deterministic=True)[0],
                            models))),
                                axis=0) / len(models)**0.5
                # print(action)

                # action, _states = model.predict(state, deterministic=True)
                # action, _states = model.predict(state)

                ## TODO delete before submitting
                if not args.no_render:
                    env.render()

                state, reward, done, _ = env.step(action)
                ret += reward

            print("Episode return:", ret)

    else:

        tensorboard_log_dir = None if args.tensorboard_log_dir is None else os.path.join(
            args.tensorboard_log_dir, get_exp_name())

        model = SAC("MlpPolicy",
                    env,
                    learning_rate=lr_schedule,
                    buffer_size=args.buffer_size,
                    learning_starts=args.learning_starts,
                    n_episodes_rollout=args.train_episodes,
                    batch_size=args.batch_size,
                    tau=args.tau,
                    gamma=args.gamma,
                    train_freq=args.train_freq,
                    gradient_steps=args.gradient_steps,
                    ent_coef="auto"
                    if args.ent_coef == "auto" else float(args.ent_coef),
                    use_sde=False,
                    policy_kwargs=dict(log_std_init=-3,
                                       net_arch=args.net_arch,
                                       use_expln=True),
                    tensorboard_log=tensorboard_log_dir,
                    rew_skip_thres=args.rew_skip_thres,
                    seed=args.seed)

        model.verbose = 2

        callbacks = [
            CheckpointCallback(20000,
                               "checkpoints",
                               name_prefix=get_exp_name()),
            EvalCallback(
                gym.make(getEnvName()),
                callback_on_new_best=SaveBestModelCallback(
                    save_path="best/" + get_exp_name() + "_best_model.zip"),
                eval_freq=20000,
                n_eval_episodes=5,
                deterministic=True),
            EpisodeCallback(env, model)
        ]

        print(args.log_interval)
        model.learn(args.timesteps,
                    log_interval=args.log_interval,
                    callback=callbacks)

        # Final evaluation
        env = wrappers.EvaluationWrapper(gym.make(getEnvName()),
                                         evaluate_for=200,
                                         seed=args.seed)

        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                action, _states = model.predict(state, deterministic=True)
                state, reward, done, _ = env.step(action)

        model.save(get_exp_name())