Example #1
def train():
    """
  Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
  """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)
    else:
        logger.configure(format_strs=[])

    workerseed = SEED + 10000 * rank
    set_global_seeds(workerseed)
    env = make_env(workerseed)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1.load(BEST_MODEL_PATH, env=env)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env
    if rank == 0:
        model.save(os.path.join(
            LOGDIR, "final_model"))  # we probably never reach this point
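The make_env helper used above is not shown. A minimal sketch, assuming it only builds and seeds the slimevolleygym environment for one MPI worker (the env id SlimeVolley-v0 and the gym-based construction are assumptions):

import gym
import slimevolleygym  # noqa: F401  # assumed dependency; registers SlimeVolley-v0


def make_env(seed):
    # Build and seed one SlimeVolley environment for this worker.
    env = gym.make("SlimeVolley-v0")
    env.seed(seed)
    return env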
Example #2
    def train(self, tensorboard_log: str) -> None:

        try:
            self.load_model(tensorboard_log=tensorboard_log)
        except Exception:
            # Fall back to building a fresh model if no checkpoint can be loaded
            self.create_model(tensorboard_log=tensorboard_log)

        # Stop training if reward gets close to zero
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-0.1,
                                                         verbose=1)
        eval_callback = EvalCallback(self.env,
                                     callback_on_new_best=callback_on_best,
                                     verbose=1)

        # Save the model every 1000 steps
        checkpoint_callback = CheckpointCallback(
            save_freq=1000, save_path='./model_checkpoints/')

        # Chain callbacks together
        callback = CallbackList([eval_callback, checkpoint_callback])

        # Train model
        self.model.learn(total_timesteps=int(1e10),
                         callback=callback,
                         tb_log_name="run")

        # Training is complete; checkpoints were already saved by checkpoint_callback
        print("Training is finished!")
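The load_model and create_model methods called in the try/except above are not included in the snippet. A minimal sketch of what they might look like, assuming a PPO2 agent, a self.env attribute, and a checkpoint path of ./model_checkpoints/latest_model (class name, algorithm, and path are all assumptions):

from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv


class Trainer:
    # Hypothetical container for the train() method shown above.
    def __init__(self, env):
        self.env = DummyVecEnv([lambda: env])
        self.model = None

    def create_model(self, tensorboard_log: str) -> None:
        # Build a fresh PPO2 agent when no checkpoint can be loaded.
        self.model = PPO2('MlpPolicy', self.env, verbose=1,
                          tensorboard_log=tensorboard_log)

    def load_model(self, tensorboard_log: str) -> None:
        # Resume training from a previously saved checkpoint.
        self.model = PPO2.load('./model_checkpoints/latest_model',
                               env=self.env, tensorboard_log=tensorboard_log)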
Example #3
def run(model_name, iteration, world, stage):
    world_stage = 'SuperMarioBros-{}-{}-v2'.format(world, stage)
    env = gym_super_mario_bros.make(world_stage)
    env = JoypadSpace(env, RIGHT_ONLY)
    env = WarpFrame(env)
    env = FrameStack(env, n_frames=4)
    env = EpisodicLifeEnv(env)
    # env = MaxAndSkipEnv(env)

    # Save a checkpoint every 5000 steps
    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./logs/',
                                             name_prefix=model_name)

    eval_callback = EvalCallback(env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=10000,
                                 deterministic=True,
                                 render=False)

    print("Compiling model...")
    steps = 10000

    if iteration > 0:
        model = DQN.load('models/{}'.format(model_name),
                         env=env,
                         verbose=1,
                         learning_starts=2500,
                         learning_rate=1e-4,
                         exploration_final_eps=0.01,
                         prioritized_replay=True,
                         prioritized_replay_alpha=0.6,
                         train_freq=4,
                         tensorboard_log="./mario_tensorboard/")
    else:
        model = DQN(CnnPolicy,
                    env,
                    verbose=1,
                    learning_starts=2500,
                    learning_rate=1e-4,
                    exploration_final_eps=0.01,
                    prioritized_replay=True,
                    prioritized_replay_alpha=0.6,
                    train_freq=4,
                    tensorboard_log="./mario_tensorboard/")

    print("Training starting...")
    with ProgressBarManager(steps) as progress_callback:
        model.learn(
            total_timesteps=steps,
            # callback=[progress_callback, eval_callback, checkpoint_callback],
            callback=[progress_callback],
            tb_log_name=model_name)
    print("Finished training model on env...\n")
    model.save("models/{}".format(model_name))
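ProgressBarManager is not defined in this snippet; it matches the tqdm-based progress-bar pattern from the Stable Baselines callback documentation. A minimal sketch, assuming that pattern:

from tqdm.auto import tqdm
from stable_baselines.common.callbacks import BaseCallback


class ProgressBarCallback(BaseCallback):
    # Update a tqdm progress bar with the number of timesteps done so far.
    def __init__(self, pbar):
        super(ProgressBarCallback, self).__init__()
        self._pbar = pbar

    def _on_step(self):
        self._pbar.n = self.num_timesteps
        self._pbar.update(0)
        return True


class ProgressBarManager(object):
    # Context manager that creates the bar and closes it cleanly afterwards.
    def __init__(self, total_timesteps):
        self.pbar = None
        self.total_timesteps = total_timesteps

    def __enter__(self):
        self.pbar = tqdm(total=self.total_timesteps)
        return ProgressBarCallback(self.pbar)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pbar.n = self.total_timesteps
        self.pbar.update(0)
        self.pbar.close()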
Example #4
def test_callbacks(model_class):

    env_id = 'Pendulum-v0'
    if model_class in [ACER, DQN]:
        env_id = 'CartPole-v1'

    allowed_failures = []
    # The number of training timesteps is kept short here; otherwise the
    # training would take too long or require custom parameters per algorithm
    if model_class in [PPO1, DQN, TRPO]:
        allowed_failures = ['rollout_end']

    # Create RL model
    model = model_class('MlpPolicy', env_id)

    checkpoint_callback = CheckpointCallback(save_freq=500, save_path=LOG_FOLDER)

    # For testing: use the same training env
    eval_env = model.get_env()
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best,
                                 best_model_save_path=LOG_FOLDER,
                                 log_path=LOG_FOLDER, eval_freq=100)

    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=LOG_FOLDER,
                                             name_prefix='event')
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback])

    model.learn(500, callback=callback)
    model.learn(200, callback=None)
    custom_callback = CustomCallback()
    model.learn(200, callback=custom_callback)
    # Check that every callback hook was executed
    custom_callback.validate(allowed_failures=allowed_failures)
    # Transform callback into a callback list automatically
    custom_callback = CustomCallback()
    model.learn(500, callback=[checkpoint_callback, eval_callback, custom_callback])
    # Check that every callback hook was executed
    custom_callback.validate(allowed_failures=allowed_failures)

    # Automatic wrapping, old way of doing callbacks
    model.learn(200, callback=lambda _locals, _globals: True)

    # Cleanup
    if os.path.exists(LOG_FOLDER):
        shutil.rmtree(LOG_FOLDER)
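The CustomCallback used for validation above comes from the test suite and is not shown. A simplified sketch of the idea, which differs from the real test helper: record every callback hook that fires so validate() can check that none were skipped:

from stable_baselines.common.callbacks import BaseCallback


class CustomCallback(BaseCallback):
    # Simplified stand-in: remember which callback hooks were triggered.
    def __init__(self):
        super(CustomCallback, self).__init__()
        self.called = set()

    def _on_training_start(self):
        self.called.add('training_start')

    def _on_rollout_start(self):
        self.called.add('rollout_start')

    def _on_step(self):
        self.called.add('step')
        return True

    def _on_rollout_end(self):
        self.called.add('rollout_end')

    def _on_training_end(self):
        self.called.add('training_end')

    def validate(self, allowed_failures=()):
        expected = {'training_start', 'rollout_start', 'step',
                    'rollout_end', 'training_end'}
        missing = expected - self.called - set(allowed_failures)
        assert not missing, "Hooks that never fired: {}".format(missing)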
Example #5
def run():
    # folders
    log_dir = "./logs/"
    models_path = "./trained_models/"
    best_model_save_path = models_path + "ppo_sokoban_model"

    # hyperparameters
    gamma = 0.99  # discount factor
    ent_coef = 0.01  # entropy coefficient for the loss calculation
    n_envs = 4  # number of environments
    n_steps = 20  # steps to run per environment per update (batch size is n_steps * n_envs)
    learning_rate = 0.00025  # the learning rate; it can also be a function
    vf_coef = 0.5  # value function coefficient for the loss calculation
    max_grad_norm = 0.5  # the maximum value for gradient clipping
    lam = 0.95  # bias vs. variance trade-off factor for the Generalized Advantage Estimator
    timesteps = 100
    verbose = 1

    n_measurements = 10  # number of measurements for the graph
    eval_callback_freq = 20  # timesteps / n_measurements; interval between evaluations to achieve the desired n_measurements

    # multiprocess environment
    env = make_vec_env('Boxoban-Train-v1', n_envs=n_envs)
    first_env = env.envs[0]
    first_env = Monitor(first_env, log_dir)
    eval_callback = EvalCallback(first_env, best_model_save_path=best_model_save_path,
                                 log_path=log_dir, eval_freq=eval_callback_freq,
                                 deterministic=True, render=False)

    model = PPO2(MlpPolicy, env,
        gamma=gamma,
        ent_coef=ent_coef,
        n_steps=n_steps,
        learning_rate=learning_rate,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        lam=lam,
        verbose=verbose)
    model.learn(total_timesteps=timesteps, callback=eval_callback)
    #model.save("trained_models/ppo2_sokoban_model") # save model to disk

    # Enjoy trained agent

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        if rewards[0] > 10:
            print("Completed the puzzle")
        time.sleep(0.1)
        env.render("human")
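EvalCallback stores the best agent as best_model.zip inside best_model_save_path, so the demo loop above could also reload that checkpoint rather than reuse the in-memory model. A short sketch, continuing from the names defined in run() above:

from stable_baselines import PPO2

# Reload the best checkpoint written by EvalCallback during training.
best_model = PPO2.load(best_model_save_path + "/best_model", env=env)
obs = env.reset()
action, _states = best_model.predict(obs, deterministic=True)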
Example #6
def init_env(env_id):
    if parallel:
        env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
        reward_env = SubprocVecEnv([make_env(env_id, i) for i in range(1)])
    else:
        env = DummyVecEnv([make_env(env_id, i) for i in range(num_cpu)])
        reward_env = DummyVecEnv([make_env(env_id, i) for i in range(1)])
    if terminate_early:
        callback_on_best = StopTrainingOnRewardThreshold(
            reward_threshold=0.85, verbose=verbose)
        eval_callback = EvalCallback(reward_env,
                                     callback_on_new_best=callback_on_best,
                                     eval_freq=10_000,
                                     verbose=verbose)
        return env, reward_env, eval_callback
    else:
        return env, reward_env, None
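The make_env(env_id, i) factory passed to SubprocVecEnv/DummyVecEnv above is not shown; it is normally the standard thunk-returning pattern (the same one that appears in Example #22 further down). A sketch under that assumption:

import gym
from stable_baselines.common import set_global_seeds


def make_env(env_id, rank, seed=0):
    # Return a thunk that builds and seeds one environment for worker `rank`.
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init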
Example #7
    def train(self):

        # Load latest model if available
        try:
            path = os.getcwd()
            os.chdir(os.getcwd() + '/model_checkpoints')
            files = [x for x in os.listdir() if x.endswith(".zip")]
            num = []
            for file in files:
                num.append([int(x) for x in file.split('_') if x.isdigit()][0])
            filename = "rl_model_" + str(max(num)) + "_steps.zip"
            print("Attempting to load: " + filename)
            self.model = PPO2.load(load_path=filename, env=DummyVecEnv([lambda: self.env]), tensorboard_log='./a2c_rasp_tensorboard/')
            print("Successfully loaded the previous model: " + filename)
            os.chdir(path)
        except Exception:
            # Vector-encode our new environment
            env = DummyVecEnv([lambda: self.env])
            # Create new model
            self.model = PPO2('MlpPolicy', env, verbose=1, tensorboard_log='./a2c_rasp_tensorboard/')
            print("Successfully created new model")

        # Stop training if the reward gets close to zero
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1e-2, verbose=1)
        eval_callback = EvalCallback(self.env, callback_on_new_best=callback_on_best, verbose=1)

        # Save model at regular time intervals
        checkpoint_callback = CheckpointCallback(save_freq=2000, save_path='./model_checkpoints/')

        # Chain callbacks together
        callback = CallbackList([eval_callback, checkpoint_callback])

        # Train model
        episode = 1
        while episode < 10:
            # Update location of red dot
            _ = self.env.square
            if self.env.trainable:
                print("Beginning episode number {}".format(episode))
                self.model.learn(total_timesteps=int(1e10), callback=callback, tb_log_name="run")
                episode += 1

        # Save trained model
        self.model.save("raspberry_agent")
Example #8
def train_models(env, vecenv):
    seeds = [1, 2, 3]

    for seed in seeds:
        algos = [{'name': 'a2c', 'model': a2c(vecenv, seed)},
                 {'name': 'acktr', 'model': acktr(vecenv, seed)},
                 {'name': 'ddpg', 'model': ddpg(env, seed)},
                 {'name': 'ppo', 'model': ppo(vecenv, seed)},
                 {'name': 'sac', 'model': sac(env, seed)},
                 {'name': 'td3', 'model': td3(env, seed)},
                 {'name': 'trpo', 'model': trpo(env, seed)}]

        for a in algos:
            cb = EarlyStopCallback(reward_threshold=5000, verbose=1)
            early_stop = EvalCallback(env, callback_on_new_best=cb, verbose=1)

            a['model'].learn(total_timesteps=int(1e10), callback=early_stop)
            a['model'].save(f'data/models/{a["name"]}_{seed}')
            tf.reset_default_graph()
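EarlyStopCallback is project-specific and not shown; given how it is constructed (reward_threshold, verbose) and passed as callback_on_new_best, it presumably mirrors the built-in StopTrainingOnRewardThreshold. A sketch under that assumption:

from stable_baselines.common.callbacks import BaseCallback


class EarlyStopCallback(BaseCallback):
    # Assumed behaviour: stop training once the parent EvalCallback records
    # a best mean reward above the threshold.
    def __init__(self, reward_threshold, verbose=0):
        super(EarlyStopCallback, self).__init__(verbose)
        self.reward_threshold = reward_threshold

    def _on_step(self):
        continue_training = self.parent.best_mean_reward < self.reward_threshold
        if self.verbose > 0 and not continue_training:
            print("Stopping: best mean reward {:.2f} reached the threshold {}".format(
                self.parent.best_mean_reward, self.reward_threshold))
        return continue_training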
Example #9
def create_eval_callback(env_id,
                         save_dir='./logs',
                         eval_freq=1000,
                         n_eval_episodes=10):
    """
    :param env_id: environment id
    :param save_dir: the directory to save the best model
    :param eval_freq: the frequency of the evaluation callback
    :param n_eval_episodes: the number  of evaluation of each callback
    :return: EvalCallback for training
    """
    eval_env = gym.make(env_id)
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=save_dir,
                                 log_path=save_dir,
                                 eval_freq=eval_freq,
                                 n_eval_episodes=n_eval_episodes,
                                 deterministic=False,
                                 render=False)
    return eval_callback
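Typical usage of the helper above; the algorithm (PPO2) and the environment id are placeholders, not part of the original snippet:

from stable_baselines import PPO2

model = PPO2('MlpPolicy', 'CartPole-v1', verbose=1)
callback = create_eval_callback('CartPole-v1', save_dir='./logs', eval_freq=1000)
model.learn(total_timesteps=100000, callback=callback)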
Example #10
    def create_callbacks(self, eval_env: VecEnv) -> List[BaseCallback]:
        callbacks: List[BaseCallback] = list()

        if self.use_eval_callback:
            model_path: str = os.path.join('non_learning_io_logs', self.model_name, "")
            eval_callback = EvalCallback(eval_env, best_model_save_path=model_path, log_path=model_path,
                                         eval_freq=2 ** 13, verbose=0, n_eval_episodes=32,
                                         deterministic=True, render=False)
            callbacks.append(eval_callback)

        if self.verbose:
            callbacks.append(
                PbarCallback(
                    tqdm(desc="Training Steps Progress",
                         total=self.total_train_steps,
                         file=sys.stdout),
                    num_envs=self.n_envs
                )
            )

        return callbacks
Example #11
    def build_eval_callback(
        self, eval_freq=10000, reward_threshold=900, log_path=None, eval_episodes=10, eval_env=None,
    ):
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=reward_threshold, verbose=1)
        eval_callback = EvalCallback(
            eval_env=eval_env,
            best_model_save_path=log_path,
            log_path=log_path,
            eval_freq=eval_freq,
            deterministic=True,
            render=False,
            n_eval_episodes=eval_episodes,
            callback_on_new_best=callback_on_best,
            verbose=1,
        )
        self.logger.debug(
            "Eval callback called every {} timesteps: stop training when mean reward is above {} in {} episodes".format(
                eval_freq, reward_threshold, eval_episodes
            )
        )
        return eval_callback
Example #12
def test_recurrent_eval_callback():
    env_id = 'Pendulum-v0'

    # Create envs
    env = make_vec_env(env_id, n_envs=4)
    eval_env = make_vec_env(env_id, n_envs=1)

    # Create RL model
    model = PPO2('MlpLstmPolicy', env)

    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best,
                                 best_model_save_path=LOG_FOLDER,
                                 log_path=LOG_FOLDER, eval_freq=100)

    model.learn(500, callback=eval_callback)

    # Cleanup
    if os.path.exists(LOG_FOLDER):
        shutil.rmtree(LOG_FOLDER)
Example #13
def learn(env_name, seed, load_path, save_path, tensorboard_log, total_timesteps, n_cpu):
    save_path = env_name if save_path is None else save_path
    checkpoint_callback = CheckpointCallback(save_freq=2000, save_path=save_path)
    eval_env = make_env(env_name, n_cpu, seed)()
    eval_callback = EvalCallback(eval_env, best_model_save_path=save_path+'/best', log_path=tensorboard_log, eval_freq=1000)
    callback = CallbackList([checkpoint_callback, eval_callback])

    policy = CnnPolicy
    # policy = CnnLstmPolicy
    # policy = CnnLnLstmPolicy
    print(env_name, policy)
    # Run this to enable SubprocVecEnv on Mac OS X.
    # export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
    # see https://github.com/rtomayko/shotgun/issues/69#issuecomment-338401331
    env = SubprocVecEnv([make_env(env_name, i, seed) for i in range(n_cpu)])
    if load_path is not None:
        model = PPO2.load(load_path, env, verbose=1, tensorboard_log=tensorboard_log)
    else:
        model = PPO2(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=total_timesteps, log_interval=5, callback=callback)
    print('saving model:', save_path+'/latest_model')
    model.save(save_path+'/latest_model')
    env.close()
Example #14
def train(agent=None):
    weights = {'fr': 0.3, 'fl': 20, 'fk': 20}
    depth, width, move_dist, plan_dist = 3, 3, 3, 3
    mkenv = lambda: Env(depth, width, move_dist, plan_dist,
                        max_steps=20, weights=weights,
                        obstacle_pct=0.1)

    eval_callback = EvalCallback(mkenv(),
                                 best_model_save_path='logs/models',
                                 log_path='logs', eval_freq=1_000,
                                 deterministic=True, render=False)

    vecenv = make_vec_env(mkenv, 32, monitor_dir='logs/training')
    if agent:
        agent.set_env(vecenv)
    else:
        hparams = dict(n_steps=64, nminibatches=64, gamma=0.90,
                       learning_rate=2e-5, ent_coef=0.01,
                       cliprange=0.4, noptepochs=25, lam=0.99)
        agent = PPO2('MlpPolicy', vecenv, verbose=True, **hparams)
    agent.learn(1_000_000, callback=eval_callback)
    agent.save('logs/models/final')
    vecenv.close()
    return agent
Example #15
                        type=int)
    parser.add_argument('--save-freq', help='Save the model every n steps (if negative, no checkpoint)',
                        default=-1, type=int)
    args = parser.parse_args()

    env_id = args.env
    n_timesteps = args.n_timesteps
    save_path = '{}_{}'.format(args.algo, env_id)

    # Instantiate and wrap the environment
    env = TimeFeatureWrapper(gym.make(env_id))

    # Create the evaluation environment and callbacks
    eval_env = DummyVecEnv([lambda: TimeFeatureWrapper(gym.make(env_id))])

    callbacks = [EvalCallback(eval_env, best_model_save_path=save_path)]

    # Save a checkpoint every n steps
    if args.save_freq > 0:
        callbacks.append(CheckpointCallback(save_freq=args.save_freq, save_path=save_path,
                                            name_prefix='rl_model'))

    algo = {
        'sac': SAC,
        'td3': TD3
    }[args.algo]

    n_actions = env.action_space.shape[0]

    # Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
    hyperparams = {
def train_initial_policy(
        model_name,
        algo=ALGO,
        env_name=ENV_NAME,
        time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ", "data/models/" + algo.__name__ + "_initial_policy_" + env_name + "_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0:
        env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda : env])

    if NORMALIZE :
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )


    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))
        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                    feature_extraction="mlp", layers=[400, 300])

        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    timesteps_per_batch=args['timesteps_per_batch'],
                    lam=args['lam'],
                    max_kl=args['max_kl'],
                    gamma=args['gamma'],
                    vf_iters=args['vf_iters'],
                    vf_stepsize=args['vf_stepsize'],
                    entcoeff=args['entcoeff'],
                    cg_damping=args['cg_damping'],
                    cg_iters=args['cg_iters'],
                    seed=SEED,
                    )

    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)

    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard,
                     env,
                     n_steps=int(args['n_steps']/env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )

    else:
        print('No algorithm matched. Using SAC .. ')
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')

    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)

        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/'+env_name+'.pkl')

    print('done :: ', model_name)
    exit()
# Separate evaluation env
if SAVE_BEST_FOR_20:
    eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
    if NORMALIZE:
        eval_env = VecNormalize(eval_env,
                                training=True,
                                norm_obs=True,
                                norm_reward=False,
                                clip_reward=1e6,
                                )


    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=model_name[:-4],
                                 n_eval_episodes=30,
                                 eval_freq=5000,
                                 deterministic=True,
                                 render=False,
                                 verbose=1)

    def save_the_model():
        shutil.move(model_name[:-4]+'/best_model.zip', model_name)
        try:
            os.rmdir(model_name[:-4])
            print('Successfully saved the model.')
        except Exception as e:
            print(e)


def evaluate_policy_on_env(env,
                           model,
Example #18
                    action='store',
                    default="gait2d_td3.h5f")
args = parser.parse_args()

# set to get observation in array
#def _new_step(self, action, project=True, obs_as_dict=False):
#    return super(Arm2DEnv, self).step(action, project=project, obs_as_dict=obs_as_dict)
#Arm2DEnv.step = _new_step
# Load walking environment
env = Gait2DGenAct(args.visualize, integrator_accuracy=3e-2)
eval_env = Gait2DGenAct(integrator_accuracy=3e-2)
#env = Arm2DVecEnv(visualize=True)
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=1000,
                                                 verbose=1)
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=callback_on_best,
                             verbose=1)

n_actions = env.action_space.shape[-1]

param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=float(0.1) *
                                            np.ones(n_actions),
                                            theta=0.05)
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.287)


class CustomTD3Policy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomTD3Policy, self).__init__(*args,
Example #19
    return _init


if __name__ == '__main__':

    num_cpu = 15  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = environment(x, y, z, gamma, cutoffpenaltyscalar, rg_prob, turnspc, savepath)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    
    # create callbacks to record data and initiate events during training
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env, log_path=savepath, n_eval_episodes=5,
                     deterministic=False, best_model_save_path=savepath)
    ])
    

    # create model with the Stable Baselines package
    model = A2C(CnnPolicy, env, gamma=gamma, n_steps=updatesteps, learning_rate=LR, verbose=1)  # , tensorboard_log=scenario
    # total timesteps is set to a very large number so the program terminates based on the runtime parameter
    model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist)
    
    
    # create learning curve plot
    evaluations = './%s/%s/evaluations.npz' % (storagefolder, scenario)
    data = np.load(evaluations)
    results = data['results']
    y = np.average(results, axis=1)
    timesteps = data['timesteps']
    plt.plot(timesteps, y)
    
Example #20
def main():
    """ Prepare for trainings """
    log_dir, model_dir = prepare_dirs()

    model_name = model_dir + '/' + MODEL_NAME
    print(f'model will be saved as {model_name}')

    log_dir = log_dir + '/' + MODEL_NAME
    """ Generate & Check environment """
    env_name = ENV_NAME
    env = gym.make(env_name)
    # print(f'Observation space: {env.observation_space}')
    # print(f'Action space: {env.action_space}')
    # env = Monitor(env, log_dir, allow_early_resets=True)
    # check_env(env)
    """ Save config as pickle file """
    config = summarize_config(env)
    save_config(log_dir, config)
    """ Vectorize environment """
    #num_envs = NUM_ENVS
    #env = DummyVecEnv([lambda: env for _ in range(num_envs)])  # For training

    #eval_env = DummyVecEnv([lambda: gym.make(env_name)])  # For evaluation
    eval_env = gym.make(env_name)
    """ Define checkpoint callback """
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=model_name,
                                             name_prefix=MODEL_NAME)
    """ Use deterministic actions for evaluation callback """
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=model_name,
                                 log_path=log_dir,
                                 eval_freq=EVAL_FREQ,
                                 deterministic=True,
                                 render=False,
                                 n_eval_episodes=N_EVAL_EPISODES)

    print(f'Algorithm: {ALGORITHM}\n')

    if not CONTINUAL_LEARNING:
        """ Define model """
        model = define_model(env, log_dir)
    else:
        model = load_model(env, model_dir, log_dir)
    """ Evaluate model before training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                          env=eval_env,
    #                                          n_eval_episodes=N_EVAL_EPISODES)
    # print(f'Before training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')
    """ Train model """
    model.learn(total_timesteps=MAX_STEPS,
                callback=[checkpoint_callback, eval_callback])
    """ Evaluate model after training """
    # mean_reward, std_reward = evaluate_policy(model=model,
    #                                          env=eval_env,
    #                                          n_eval_episodes=N_EVAL_EPISODES)
    # print(f'After training: mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')
    """ Save trained model """
    model.save(model_name)
    """ Test trained model """
    obs = eval_env.reset()
    for i in range(N_EVAL_EPISODES):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()

    env.close()
    eval_env.close()

Example #21
if __name__ == '__main__':

    num_cpu = ncpu  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = evalenv(x, y, z, gamma, turnspc, policyname)
    env1 = environment(x, y, z, gamma, turnspc, penaltyscalar, policyname)  # env annealrate / numturns * eval_freq
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    
    # create callbacks to record data and initiate events during training
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env, log_path=evpath, n_eval_episodes=100, eval_freq=50000,
                     deterministic=False, best_model_save_path=evpath),
        EvalCallback(env1, log_path=savepath, n_eval_episodes=20, eval_freq=50000,
                     deterministic=False, best_model_save_path=savepath)
    ])
    if os.path.exists("%s/best_model.zip" % savepath):
        # Instantiate the agent
        model = A2C(policy, env, gamma=gamma, n_steps=episodetimesteps, learning_rate=LR, verbose=1, n_cpu_tf_sess=num_cpu)
        # Load the trained agent
        model = A2C.load("%s/best_model" % savepath, env=env)
        print('loaded agent')
        save_evals()

        # total timesteps is set to a very large number so the program terminates based on the runtime parameter
        model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist)
        
        
    else:
        # create model with the Stable Baselines package
        model = A2C(policy, env, gamma=gamma, n_steps=episodetimesteps, learning_rate=LR, verbose=1, n_cpu_tf_sess=num_cpu)  # , tensorboard_log=scenario
Example #22
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = environment(x,y,z, gamma)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init


#points_values=list([[0,LR1],[1000000,LR2]])

#Sched=PiecewiseSchedule(points_values, outside_value=LR2)

if __name__ == '__main__':

    num_cpu = 1  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = environment(x, y, z, gamma)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)
    scenario = str(f'RG_t{test}_lr{LR}_gamma{gamma}_batch{batch_size}')
    callbacklist = CallbackList([TimeLimit(episodetimesteps), EvalCallback(eval_env, log_path=scenario, deterministic=False)])
    

        
    model = A2C(CnnPolicy, env, gamma=gamma, verbose=1)#, tensorboard_log=scenario)
    model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)
Example #23
from stable_baselines import PPO2
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from BeautifulBlueSquare.BlueGymEnv import simpleAvoidance

# Separate evaluation env
eval_env = simpleAvoidance()

# Stop training when the model reaches the reward threshold, 800 * .9 = 720
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=720,
                                                 verbose=1)

# Create a callback that evaluates the model, saves the best one, and stops training once the 720 reward threshold is reached
eval_callback = EvalCallback(eval_env,
                             n_eval_episodes=20,
                             eval_freq=int(800 * 50),
                             callback_on_new_best=callback_on_best,
                             best_model_save_path="model",
                             log_path="model",
                             verbose=1)

# Almost infinite number of timesteps, but the training will stop
# early as soon as the reward threshold is reached
env = simpleAvoidance()

model = PPO2(CnnPolicy, env, gamma=.99, n_steps=256)
model.learn(total_timesteps=int(20e6), callback=eval_callback)
Example #24
    'ledHSVHigher': np.array([31, 9, 255]),
    'rPiIP': '192.168.0.183',
    'rPiPort': 50000,
    'episodeLength': 100,
    'bullseye': 10
}

env = make_vec_env(RPiLEDEnv, n_envs=1, env_kwargs=envArgsDict)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-500,
                                                 verbose=1)

eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=500,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)

# Added checkpoint because I lost model data after a crash when the webcam shutdown because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='ppo2_model')

cb = CallbackList([checkpoint_callback, eval_callback])

policy_kwargs = {'layers': [128, 128]}

model = PPO2(MlpPolicy,
             env,
Example #25
import gym_env
from stable_baselines.common.callbacks import CallbackList, CheckpointCallback, EvalCallback

checkpoint_callback = CheckpointCallback(save_freq=10000,
                                         save_path='./tf_model_logs/')
# Separate evaluation env
eval_env = gym_env.PegInEnv(
    "PandaPegIn",
    has_offscreen_renderer=True,
    # has_renderer=True,
    use_camera_obs=False,
    control_freq=100,
)

eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./tf_model_logs/best_model',
                             log_path='./tf_model_logs/best_model_results',
                             eval_freq=10000)
# Create the callback list
callback = CallbackList([checkpoint_callback, eval_callback])

env = gym_env.PegInEnv(
    "PandaPegIn",
    has_offscreen_renderer=True,
    # has_renderer=True,
    use_camera_obs=False,
    control_freq=100,
)

model = PPO1(MlpPolicy,
             env,
             timesteps_per_actorbatch=2048,
Example #26
                # env.render()
            delta_returns.append(env.get_attr('final_reward')[0])
        print("naked:", naked_returns)
        print("covered:", covered_returns)
        print("rl:", rl_returns)
        print("delta:", delta_returns)

    else:
        # load data
        df_train, df_test, df_rate = load_data(cfg)
        env = DummyVecEnv([lambda: HedgeEnv(df_train, df_rate, cfg)])
        T = env.get_attr('T')[0]
        checkpoint_callback = CheckpointCallback(save_freq=int(cfg.timestep / 10),
                                                 save_path=cfg.model_dir)
        eval_callback = EvalCallback(env,
                                     best_model_save_path=cfg.model_dir,
                                     log_path=cfg.log_dir,
                                     eval_freq=int(cfg.timestep / 10),
                                     deterministic=True,
                                     render=False)
        model = DDPG(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=cfg.timestep,
                    callback=[checkpoint_callback, eval_callback])
        cfg_log.dump(cfg.cfg_file)

        obs = env.reset()
        for i in range(T):
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            env.render()
Example #27
    log_dir = f"full/{algo}-{policy}-{tag}"
    logger.configure(folder=log_dir)

    env = gym.make("SlimeVolley-v0")
    env.atari_mode = True
    env.survival_bonus = True
    env.__init__()
    env.seed(seed)

    eval_env = gym.make("SlimeVolley-v0")
    eval_env.atari_mode = True
    eval_env.__init__()
    eval_env.seed(seed)
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=log_dir,
                                 log_path=log_dir,
                                 eval_freq=eval_freq,
                                 n_eval_episodes=eval_episodes)

    print(f"Beginning training for {algo}-{policy}-{tag}.")
    params = {
        'policy': policyFn,
        'train_env': env,
        'eval_env': eval_env,
        'timesteps': timesteps,
        'eval_callback': eval_callback,
    }
    opt_params = {
        'a2c': {
            'gamma': [0.900, 0.999],
            'vf_coef': [0.10, 0.40],
Example #28
    logger.configure(folder=LOGDIR)

    env = gym.make("SlimeVolley-v0")
    env = Monitor(env, LOGDIR, allow_early_resets=True)
    env.seed(n)

    model = PPO1(BnnPolicy,
                 env,
                 timesteps_per_actorbatch=4096,
                 clip_param=0.2,
                 entcoeff=0.0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=64,
                 gamma=0.99,
                 lam=0.95,
                 schedule='linear',
                 verbose=2)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(os.path.join(LOGDIR, "final_model"))

    env.close()
Example #29
    num_cpu = ncpu  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
    eval_env = evalenv(x, y, z, turnspc, policyname)
    env1 = environment(x, y, z, turnspc, scalar, policyname)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    #create callbacks to record data, initiate events during training.
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env,
                     log_path=evpath,
                     n_eval_episodes=100,
                     eval_freq=50000,
                     deterministic=True,
                     best_model_save_path=evpath),
        EvalCallback(env1,
                     log_path=savepath,
                     n_eval_episodes=20,
                     eval_freq=10000,
                     deterministic=False,
                     best_model_save_path=savepath)
    ])
    if (os.path.exists("%s/final_model.zip" % savepath)):
        # Instantiate the agent
        model = ACER(policy,
                     env,
                     gamma=gamma,
                     n_steps=episodetimesteps,
Example #30
	print('done')

	

	a_dim = env.action_space.shape[0]
	# td3_noise = OrnsteinUhlenbeckActionNoise(np.zeros(a_dim), .9*np.ones(a_dim)) 
	td3_noise = NormalActionNoise(0,SIGMA)
	td3_env = DummyVecEnv([lambda: env])
	# td3_env = env
	
	checkpoint_on_event = CheckpointCallback(save_freq=1000, save_path= "./logs/model_checkpoints",
                                         name_prefix='rl_model')
	event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

	eval_callback = EvalCallback(td3_env, best_model_save_path='./logs/',
                             log_path='./logs/', eval_freq=100,
                             deterministic=True, render=False)

	

	# td3_model.learning_starts = 100
	
	custom_callback = customCallback(verbose=0)
	callback = CallbackList([custom_callback, checkpoint_on_event])

	td3_model = TD3(Td3MlpPolicy, td3_env,
					gamma = GAMMA,
					learning_rate = LEARNING_RATE,
					buffer_size = BUFFER_SIZE,
					learning_starts = LEARNING_STARTS,
					train_freq = TRAIN_FREQ,