Example #1
def load_model(env, name=None):
    if name:
        filename = os.path.join(config.MODELDIR, env.name, name)
        if os.path.exists(filename):
            logger.info(f'Loading {name}')
            cont = True
            while cont:
                try:
                    ppo_model = PPO1.load(filename, env=env)
                    cont = False
                except Exception as e:
                    time.sleep(5)
                    print(e)
        else:
            raise Exception(f'\n{filename} not found')
    else:
        logger.info(f'Loading base PPO model')
        cont = True
        while cont:
            try:
                ppo_model = PPO1(get_network_arch(env.name), env=env)
                cont = False
            except Exception as e:
                time.sleep(5)
                print(e)

    return ppo_model
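# A hypothetical usage sketch for load_model (illustrative only, not from the
# source): `env` is assumed to be an environment object exposing a `.name`
# attribute that matches a subdirectory of config.MODELDIR.
#
#   model = load_model(env, name='best_model.zip')  # retries until the saved zip can be read
#   fresh = load_model(env)                          # no name: builds a new base PPO1 via get_network_arch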
Example #2
def train():
  # train selfplay agent
  logger.configure(folder=LOGDIR)

  train_env = TankSelfPlayTrainEnv()
  train_env.seed(SEED)

  eval_env = TankSelfPlayEnv()
  eval_env.seed(EVAL_SEED)

  # MuJoCo-style hyperparameters, but with timesteps_per_actorbatch doubled to cover more steps
  # (note that this script also uses a smaller optim_stepsize of 3e-5)
  model = PPO1(MlpPolicy, train_env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                   optim_stepsize=3e-5, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)

  eval_callback = SelfPlayCallback(eval_env,
    best_model_save_path=LOGDIR,
    log_path=LOGDIR,
    eval_freq=EVAL_FREQ,
    n_eval_episodes=EVAL_EPISODES,
    deterministic=False)

  model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

  model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

  train_env.close()
  eval_env.close()
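The script above relies on module-level constants that fall outside this excerpt. A plausible configuration block, with illustrative values only (none of these numbers come from the source), might look like:

LOGDIR = "exp/tank_selfplay"   # output folder for logs and checkpoints
SEED = 17                      # seed for the training environment
EVAL_SEED = 37                 # separate seed for the evaluation environment
NUM_TIMESTEPS = int(1e9)       # effectively "train until interrupted"
EVAL_FREQ = int(1e5)           # run an evaluation every EVAL_FREQ steps
EVAL_EPISODES = 100            # episodes per evaluation round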
Example #3
    def __init__(self):
        """ Do any initial setup here """

        #		self.model = PPO1(CnnPolicy, Env(), timesteps_per_actorbatch=128, clip_param=0.2, entcoeff=0.01,optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear',verbose=1)
        path = pathlib.Path(__file__).resolve().parent
        print("Loading ...", str(path) + '/ppo_save')
        self.model = PPO1.load(str(path) + '/ppo_save')
Example #4
 def __init__(self, env, output_path, timesteps_per_actorbatch,
              clip_param, entcoeff, optim_epochs,
              optim_stepsize, optim_batchsize,
              gamma, lam, schedule,
              verbose, num_timesteps):
     print(
         "Initializing PPO with output_path: {} and Hyper Params [timesteps_per_actorbatch: {},clip_param: {}, "
         "entcoeff: {}, optim_epochs: {}, optim_stepsize: {}, optim_batchsize: {}, gamma: {}, lam: {}, "
         "schedule: {}, verbose: {}, num_timesteps: {}]".format(output_path, timesteps_per_actorbatch,
                                                                clip_param, entcoeff, optim_epochs,
                                                                optim_stepsize, optim_batchsize,
                                                                gamma, lam, schedule,
                                                                verbose, num_timesteps))
     super().__init__(env, output_path,
                      PPO1(policy=MlpPolicy,
                           env=env,
                           gamma=gamma,
                           timesteps_per_actorbatch=timesteps_per_actorbatch,
                           clip_param=clip_param,
                           entcoeff=entcoeff,
                           optim_epochs=optim_epochs,
                           optim_stepsize=optim_stepsize,
                           optim_batchsize=optim_batchsize,
                           lam=lam,
                           schedule=schedule,
                           verbose=verbose),
                      num_timesteps)
Example #5
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Robotics environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """

    rank = MPI.COMM_WORLD.Get_rank()
    with mujoco_py.ignore_mujoco_warnings():
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)
        print(env.observation_space)

        tblog = "/cvgl2/u/surajn/workspace/tb_logs/ppo1_fetchreach/"
        model = PPO1(MlpPolicy,
                     env,
                     timesteps_per_actorbatch=2048,
                     clip_param=0.2,
                     entcoeff=0.0,
                     optim_epochs=5,
                     optim_stepsize=3e-4,
                     optim_batchsize=256,
                     gamma=0.99,
                     lam=0.95,
                     schedule='linear',
                     tensorboard_log=tblog,
                     verbose=1)
        model.learn(total_timesteps=num_timesteps)
        env.close()
Example #6
 def __init__(self):
   super(TankSelfPlayEnv, self).__init__()
   self.policy = self
   self.best_model = None
   self.best_model_filename = None
   self.oppo_model = None
   if USE_STRONG_OPP:
     print("Using strong opp as a start point")
     self.oppo_model = PPO1.load("ppo_small_stepsize/best_model.zip")
Example #7
def load_model(env, name):

    filename = os.path.join(config.MODELDIR, env.name, name)
    if os.path.exists(filename):
        logger.info(f'Loading {name}')
        cont = True
        while cont:
            try:
                ppo_model = PPO1.load(filename, env=env)
                cont = False
            except Exception as e:
                time.sleep(5)
                print(e)

    elif name == 'base.zip':
        cont = True
        while cont:
            try:

                rank = MPI.COMM_WORLD.Get_rank()
                if rank == 0:
                    ppo_model = PPO1(get_network_arch(env.name), env=env)
                    logger.info(f'Saving base.zip PPO model...')
                    ppo_model.save(
                        os.path.join(config.MODELDIR, env.name, 'base.zip'))
                else:

                    ppo_model = PPO1.load(os.path.join(config.MODELDIR,
                                                       env.name, 'base.zip'),
                                          env=env)

                cont = False
            except IOError as e:
                sys.exit(f'Permissions not granted on zoo/{env.name}/...')
            except Exception as e:

                print('Waiting for base.zip to be created...', e)
                time.sleep(2)

    else:
        raise Exception(f'\n{filename} not found')

    return ppo_model
Example #8
def main():
    """
    Runs the test
    """
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

    env = make_mujoco_env(args.env, args.seed)
    model = PPO1(MlpPolicy,
                 env,
                 timesteps_per_actorbatch=2048,
                 clip_param=0.2,
                 entcoeff=0.0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=64,
                 gamma=0.99,
                 lam=0.95,
                 schedule='linear')
    model.learn(total_timesteps=args.num_timesteps)

    model.save("ppo1")
    # env.close()

    del model  # remove to demonstrate saving and loading
    # env = make_mujoco_env(args.env, args.seed)

    model = PPO1.load("ppo1")
    logger.log("~!!!!!!!!")
    episode_rew = 0
    obs = env.reset()

    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        episode_rew += reward
        env.render()
        if done:
            print(f'episode_rew={episode_rew}')
            episode_rew = 0
            obs = env.reset()
Example #9
 def reset(self):
   # load model if it's there
   modellist = [f for f in os.listdir(LOGDIR) if f.startswith("history")]
   modellist.sort()
   if len(modellist) > 0:
     filename = os.path.join(LOGDIR, modellist[-1]) # the latest best model
     if filename != self.best_model_filename:
       print("loading model: ", filename)
       self.best_model_filename = filename
       if self.best_model is not None:
         del self.best_model
       self.best_model = PPO1.load(filename, env=self)
   return super(SlimeVolleySelfPlayEnv, self).reset()
Example #10
    def reset(self):
        # Load model if it's there, else wait (meant to be used in parallel with opponent training)
        # reset() is run multiple times throughout the experiment, not just during callbacks.

        while True:
            opp_modellist = [
                f for f in os.listdir(OPP_LOGDIR) if f.startswith("history")
            ]
            opp_modellist.sort()

            self_modellist = [
                f for f in os.listdir(SELF_LOGDIR) if f.startswith("history")
            ]
            self_modellist.sort()

            # Experiment just started, so no history files
            if len(self_modellist) == 0:
                return super(SlimeVolleyMultiAgentEnv, self).reset()

            # Middle of experiment
            if len(self_modellist) > 0:
                # If num of history files is the same, check opponent's last gen.
                if len(self_modellist) - len(opp_modellist) == 0:
                    opp_filename = opp_modellist[-1]
                    # Opponent's last gen has no change -> Both models still training the same gen
                    if opp_filename == self.opp_model_filename:
                        return super(SlimeVolleyMultiAgentEnv, self).reset()
                    # Opponent's last gen changed -> Opponent model has been waiting -> Load new opp.
                    elif opp_filename != self.opp_model_filename:
                        print("Loading model:", opp_filename)
                        self.opp_model_filename = opp_filename
                        if self.opp_model is not None:
                            del self.opp_model
                        self.opp_model = PPO1.load(os.path.join(
                            OPP_LOGDIR, opp_filename),
                                                   env=self)
                        return super(SlimeVolleyMultiAgentEnv, self).reset()
                # Opponent's finished current gen training, self should continue training.
                elif len(opp_modellist) - len(self_modellist) == 1:
                    print(
                        f"Self: Gen {len(self_modellist)}, Opp: Gen {len(opp_modellist)}. Opponent waiting for self training to complete."
                    )
                    return super(SlimeVolleyMultiAgentEnv, self).reset()
            print(
                f"Self: Gen {len(self_modellist)}, Opp: Gen {len(opp_modellist)}. Waiting for opponent training to complete."
            )
            time.sleep(5)
Example #11
def train(env_id, num_timesteps, seed, algorithm, model_save_file=None, log_dir=None):

    with tf_util.single_threaded_session():
        logger.configure(folder=log_dir, format_strs=['stdout', 'log', 'csv'])

        workerseed = seed + MPI.COMM_WORLD.Get_rank()
        env = make_mujoco_env(env_id, workerseed)

        if algorithm == "TRPO":
            model = TRPO(MlpPolicy, env, seed=workerseed, verbose=1)
        else:
            # Algorithm is PPO
            model = PPO1(MlpPolicy, env, seed=workerseed, verbose=1)

        model.learn(total_timesteps=num_timesteps)

        if model_save_file is not None:
            model.save(model_save_file)

        env.close()
Example #12
def train():
  # train selfplay agent
  logger.configure(folder=LOGDIR)

  env = SlimeVolleySelfPlayEnv()
  env.seed(SEED)

  model = PPO1.load(BEST_MODEL_PATH, env=env)

  eval_callback = SelfPlayCallback(env,
    best_model_save_path=LOGDIR,
    log_path=LOGDIR,
    eval_freq=EVAL_FREQ,
    n_eval_episodes=EVAL_EPISODES,
    deterministic=False)

  model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

  model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

  env.close()
Example #13
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Atari environments, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = PPO1(CnnPolicy,
                 env,
                 timesteps_per_actorbatch=256,
                 clip_param=0.2,
                 entcoeff=0.01,
                 optim_epochs=4,
                 optim_stepsize=1e-3,
                 optim_batchsize=64,
                 gamma=0.99,
                 lam=0.95,
                 schedule='linear',
                 verbose=2)
    model.learn(total_timesteps=num_timesteps)
    env.close()
Example #14
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for the Mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    env = make_mujoco_env(env_id, seed)
    model = PPO1(MlpPolicy,
                 env,
                 timesteps_per_actorbatch=2048,
                 clip_param=0.2,
                 entcoeff=0.0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=64,
                 gamma=0.99,
                 lam=0.95,
                 schedule='linear')
    model.learn(total_timesteps=num_timesteps)
    env.close()
Example #15
def train(num_timesteps, seed, model_path=None):
    """
    Train PPO1 model for the Humanoid environment, for testing purposes

    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param model_path: (str) path to the model
    """
    env_id = 'Humanoid-v2'

    env = gym.make("RoboschoolHumanoid-v1")

    # the parameters below were the best found in a simple random search;
    # they are good enough to make the humanoid walk, but whether they are
    # the absolute best is not certain
    env = RewScale(env, 0.1)
    model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                 optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear')
    model.learn(total_timesteps=num_timesteps)
    env.close()
    if model_path:
        tf_util.save_state(model_path)

    return model
Example #16
def main(args):

    rank = MPI.COMM_WORLD.Get_rank()

    model_dir = os.path.join(config.MODELDIR, args.env_name)

    if rank == 0:
        try:
            os.makedirs(model_dir)
        except OSError:
            pass  # directory already exists
        if args.reset:
            reset_files(model_dir)
        logger.configure(config.LOGDIR)
    else:
        logger.configure(format_strs=[])

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        time.sleep(5)
        logger.set_level(config.INFO)

    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    logger.info('\nSetting up the selfplay training environment opponents...')
    base_env = get_environment(args.env_name)
    env = selfplay_wrapper(base_env)(opponent_type=args.opponent_type,
                                     verbose=args.verbose)
    env.seed(workerseed)

    CustomPolicy = get_network_arch(args.env_name)

    params = {
        'gamma': args.gamma,
        'timesteps_per_actorbatch': args.timesteps_per_actorbatch,
        'clip_param': args.clip_param,
        'entcoeff': args.entcoeff,
        'optim_epochs': args.optim_epochs,
        'optim_stepsize': args.optim_stepsize,
        'optim_batchsize': args.optim_batchsize,
        'lam': args.lam,
        'adam_epsilon': args.adam_epsilon,
        'schedule': 'linear',
        'verbose': 1,
        'tensorboard_log': config.LOGDIR
    }

    time.sleep(5)  # allow time for the base model to be saved out when the environment is created

    if args.reset or not os.path.exists(
            os.path.join(model_dir, 'best_model.zip')):
        logger.info('\nLoading the base PPO agent to train...')
        model = PPO1.load(os.path.join(model_dir, 'base.zip'), env, **params)
    else:
        logger.info(
            '\nLoading the best_model.zip PPO agent to continue training...')
        model = PPO1.load(os.path.join(model_dir, 'best_model.zip'), env,
                          **params)

    #Callbacks
    logger.info(
        '\nSetting up the selfplay evaluation environment opponents...')
    callback_args = {
        'eval_env': selfplay_wrapper(base_env)(opponent_type=args.opponent_type,
                                               verbose=args.verbose),
        'best_model_save_path': config.TMPMODELDIR,
        'log_path': config.LOGDIR,
        'eval_freq': args.eval_freq,
        'n_eval_episodes': args.n_eval_episodes,
        'deterministic': False,
        'render': True,
        'verbose': 0
    }

    if args.rules:
        logger.info(
            '\nSetting up the evaluation environment against the rules-based agent...'
        )
        # Evaluate against a 'rules' agent as well
        eval_actual_callback = EvalCallback(
            eval_env=selfplay_wrapper(base_env)(opponent_type='rules',
                                                verbose=args.verbose),
            eval_freq=1,
            n_eval_episodes=args.n_eval_episodes,
            deterministic=args.best,
            render=True,
            verbose=0)
        callback_args['callback_on_new_best'] = eval_actual_callback

    # Evaluate the agent against previous versions
    eval_callback = SelfPlayCallback(args.opponent_type, args.threshold,
                                     args.env_name, **callback_args)

    logger.info('\nSetup complete - commencing learning...\n')

    model.learn(total_timesteps=int(1e9),
                callback=[eval_callback],
                reset_num_timesteps=False,
                tb_log_name="tb")

    env.close()
    del env
Example #17
import pytest

from stable_baselines.a2c import A2C
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common.identity_env import IdentityEnvMultiBinary, IdentityEnvMultiDiscrete
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy

MODEL_FUNC_LIST = [
    lambda e: A2C(policy=MlpPolicy, env=e),
    lambda e: PPO1(policy=MlpPolicy, env=e),
    lambda e: PPO2(policy=MlpPolicy, env=e),
    lambda e: TRPO(policy=MlpPolicy, env=e),
]


@pytest.mark.slow
@pytest.mark.parametrize("model_func", MODEL_FUNC_LIST)
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])

    model = model_func(env)
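    # The excerpt ends here; what follows is a hedged sketch of how such an
    # identity test typically continues (the timestep budget and the success
    # threshold are assumptions, not taken from the original test):
    model.learn(total_timesteps=10000)

    n_trials = 100
    reward_sum = 0.0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward[0]

    # a policy that has learned the identity mapping is rewarded on most steps;
    # the exact threshold here is a placeholder
    assert reward_sum / n_trials > 0.9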
  
Example #18
  RENDER_MODE = False

  SELF_LOGDIR = "exp/multi/ppo-dnn-mujoco"
  OPP_LOGDIR = "exp/multi/ppo-bnn-mujoco"
  logger.configure(folder=SELF_LOGDIR)

  env = SlimeVolleyMultiAgentEnv()
  env.seed(SEED)

  # MuJoCo-style hyperparameters, but with timesteps_per_actorbatch doubled to cover more steps
  model = PPO1(MlpPolicy, env,
        timesteps_per_actorbatch=4096,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear', verbose=2)

  eval_callback = MultiAgentCallback(env,
    best_model_save_path=SELF_LOGDIR,
    log_path=SELF_LOGDIR,
    eval_freq=EVAL_FREQ,
    n_eval_episodes=EVAL_EPISODES,
    deterministic=False)

  model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

  model.save(os.path.join(SELF_LOGDIR, "final_model"))
Example #19
learn_func_list = [
    lambda e: A2C(policy=MlpPolicy, learning_rate=1e-3, n_steps=1, gamma=0.7,
                  env=e).learn(total_timesteps=10000, seed=0),
    lambda e: ACER(policy=MlpPolicy, env=e, n_steps=1,
                   replay_ratio=1).learn(total_timesteps=10000, seed=0),
    lambda e: ACKTR(policy=MlpPolicy, env=e, learning_rate=5e-4,
                    n_steps=1).learn(total_timesteps=20000, seed=0),
    lambda e: DeepQ(policy=deepq_models.mlp([32]), batch_size=16, gamma=0.1,
                    exploration_fraction=0.001,
                    env=e).learn(total_timesteps=40000, seed=0),
    lambda e: PPO1(policy=MlpPolicy, env=e, lam=0.7, optim_batchsize=16,
                   optim_stepsize=1e-3).learn(total_timesteps=10000, seed=0),
    lambda e: PPO2(policy=MlpPolicy, env=e, learning_rate=1.5e-3,
                   lam=0.8).learn(total_timesteps=20000, seed=0),
    lambda e: TRPO(policy=MlpPolicy, env=e, max_kl=0.05,
                   lam=0.7).learn(total_timesteps=10000, seed=0),
]


@pytest.mark.slow
@pytest.mark.parametrize("learn_func", learn_func_list)
def test_identity(learn_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
Example #20
LOGDIR = args.logdir
timesteps_per_actorbatch = args.batchnum
optim_stepsize = args.optim_stepsize
gamma = args.gamma

print("***********")
print("Logging to " + LOGDIR)

logger.configure(folder=LOGDIR)

train_env = gym.make("TankGymTrain-v0")
train_env.seed(SEED)
train_env.policy = tankgym.BaselineRandWAim()

eval_env = gym.make("TankGym-v0")
eval_env.seed(EVAL_SEED)
eval_env.policy = tankgym.BaselineRandWAim()

# MuJoCo-style hyperparameters; timesteps_per_actorbatch, optim_stepsize and gamma come from the command-line arguments above
model = PPO1(MlpPolicy, train_env, timesteps_per_actorbatch=timesteps_per_actorbatch, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                 optim_stepsize=optim_stepsize, optim_batchsize=64, gamma=gamma, lam=0.95, schedule='linear', verbose=2)

eval_callback = EvalCallback(eval_env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

train_env.close()
eval_env.close()
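To round off these excerpts, here is a self-contained sketch of the basic PPO1 train / save / load / predict cycle with stable-baselines 2.x; the environment, timestep budget, and file name are arbitrary choices, not taken from any example above.

import gym

from stable_baselines import PPO1
from stable_baselines.common.policies import MlpPolicy

env = gym.make('CartPole-v1')

# train a small model and write it to disk
model = PPO1(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save('ppo1_cartpole')

# reload the saved model and run a short rollout
del model
model = PPO1.load('ppo1_cartpole', env=env)

obs = env.reset()
for _ in range(200):
    action, _states = model.predict(obs)
    obs, reward, done, _info = env.step(action)
    if done:
        obs = env.reset()
env.close()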