def load_model(env, name=None):
    if name:
        filename = os.path.join(config.MODELDIR, env.name, name)
        if os.path.exists(filename):
            logger.info(f'Loading {name}')
            cont = True
            while cont:
                try:
                    ppo_model = PPO1.load(filename, env=env)
                    cont = False
                except Exception as e:
                    time.sleep(5)
                    print(e)
        else:
            raise Exception(f'\n{filename} not found')
    else:
        logger.info('Loading base PPO model')
        cont = True
        while cont:
            try:
                ppo_model = PPO1(get_network_arch(env.name), env=env)
                cont = False
            except Exception as e:
                time.sleep(5)
                print(e)

    return ppo_model
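# Usage sketch for load_model above (illustrative, not from the source). It assumes `env` is a
# Gym-style environment with a `.name` attribute matching a subdirectory of config.MODELDIR,
# and that 'best_model.zip' exists there; make_env and the filename are hypothetical.
env = make_env('tictactoe')                  # hypothetical helper returning an env with env.name set
model = load_model(env, 'best_model.zip')    # retries every 5 seconds until the zip can be read
base_model = load_model(env)                 # no name given: builds a fresh PPO1 from get_network_arch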
def train():
    # train selfplay agent
    logger.configure(folder=LOGDIR)

    train_env = TankSelfPlayTrainEnv()
    train_env.seed(SEED)
    eval_env = TankSelfPlayEnv()
    eval_env.seed(EVAL_SEED)

    # take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps)
    model = PPO1(MlpPolicy, train_env,
                 timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0,
                 optim_epochs=10, optim_stepsize=3e-5, optim_batchsize=64,
                 gamma=0.99, lam=0.95, schedule='linear', verbose=2)

    eval_callback = SelfPlayCallback(eval_env,
                                     best_model_save_path=LOGDIR,
                                     log_path=LOGDIR,
                                     eval_freq=EVAL_FREQ,
                                     n_eval_episodes=EVAL_EPISODES,
                                     deterministic=False)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point.

    train_env.close()
    eval_env.close()
def __init__(self): """ Do any initial setup here """ # self.model = PPO1(CnnPolicy, Env(), timesteps_per_actorbatch=128, clip_param=0.2, entcoeff=0.01,optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear',verbose=1) path = pathlib.Path(__file__).resolve().parent print("Loading ...", str(path) + '/ppo_save') self.model = PPO1.load(str(path) + '/ppo_save')
def __init__(self, env, output_path, timesteps_per_actorbatch, clip_param, entcoeff, optim_epochs,
             optim_stepsize, optim_batchsize, gamma, lam, schedule, verbose, num_timesteps):
    print("Initializing PPO with output_path: {} and Hyper Params [timesteps_per_actorbatch: {}, clip_param: {}, "
          "entcoeff: {}, optim_epochs: {}, optim_stepsize: {}, optim_batchsize: {}, gamma: {}, lam: {}, "
          "schedule: {}, verbose: {}, num_timesteps: {}]".format(output_path, timesteps_per_actorbatch, clip_param,
                                                                 entcoeff, optim_epochs, optim_stepsize,
                                                                 optim_batchsize, gamma, lam, schedule, verbose,
                                                                 num_timesteps))
    super().__init__(env, output_path,
                     PPO1(policy=MlpPolicy, env=env, gamma=gamma,
                          timesteps_per_actorbatch=timesteps_per_actorbatch, clip_param=clip_param,
                          entcoeff=entcoeff, optim_epochs=optim_epochs, optim_stepsize=optim_stepsize,
                          optim_batchsize=optim_batchsize, lam=lam, schedule=schedule, verbose=verbose),
                     num_timesteps)
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Robotics environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()
    with mujoco_py.ignore_mujoco_warnings():
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)
        print(env.observation_space)
        tblog = "/cvgl2/u/surajn/workspace/tb_logs/ppo1_fetchreach/"
        model = PPO1(MlpPolicy, env,
                     timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0,
                     optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
                     gamma=0.99, lam=0.95, schedule='linear',
                     tensorboard_log=tblog, verbose=1)
        model.learn(total_timesteps=num_timesteps)
        env.close()
def __init__(self):
    super(TankSelfPlayEnv, self).__init__()
    self.policy = self
    self.best_model = None
    self.best_model_filename = None
    self.oppo_model = None
    if USE_STRONG_OPP:
        print("Using strong opp as a start point")
        self.oppo_model = PPO1.load("ppo_small_stepsize/best_model.zip")
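# Sketch (not from the source) of the opponent hook implied by `self.policy = self` above:
# the env itself acts as the opponent policy, so it only needs a predict() method. The method
# name, the fallback order between best_model and oppo_model, and the random fallback are
# assumptions here.
def predict(self, obs):
    if self.best_model is not None:
        action, _ = self.best_model.predict(obs)   # latest self-play checkpoint
        return action
    if self.oppo_model is not None:
        action, _ = self.oppo_model.predict(obs)   # the optional strong starting opponent
        return action
    return self.action_space.sample()              # random play before any model exists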
def load_model(env, name):
    filename = os.path.join(config.MODELDIR, env.name, name)
    if os.path.exists(filename):
        logger.info(f'Loading {name}')
        cont = True
        while cont:
            try:
                ppo_model = PPO1.load(filename, env=env)
                cont = False
            except Exception as e:
                time.sleep(5)
                print(e)
    elif name == 'base.zip':
        # rank 0 creates and saves the base model; the other MPI workers wait for it and load it
        cont = True
        while cont:
            try:
                rank = MPI.COMM_WORLD.Get_rank()
                if rank == 0:
                    ppo_model = PPO1(get_network_arch(env.name), env=env)
                    logger.info('Saving base.zip PPO model...')
                    ppo_model.save(os.path.join(config.MODELDIR, env.name, 'base.zip'))
                else:
                    ppo_model = PPO1.load(os.path.join(config.MODELDIR, env.name, 'base.zip'), env=env)
                cont = False
            except IOError as e:
                sys.exit(f'Permissions not granted on zoo/{env.name}/...')
            except Exception as e:
                print('Waiting for base.zip to be created...', e)
                time.sleep(2)
    else:
        raise Exception(f'\n{filename} not found')

    return ppo_model
def main(): """ Runs the test """ args = mujoco_arg_parser().parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) env = make_mujoco_env(args.env, args.seed) model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') model.learn(total_timesteps=args.num_timesteps) model.save("ppo1") # env.close() del model # remove to demonstrate saving and loading # env = make_mujoco_env(args.env, args.seed) model = PPO1.load("ppo1") logger.log("~!!!!!!!!") episode_rew = 0 obs = env.reset() while True: action, _states = model.predict(obs) ob, reward, done, info = env.step(action) episode_rew += reward env.render() if done: print(f'episode_rew={episode_rew}') episode_rew = 0 obs = env.reset()
def reset(self):
    # load model if it's there
    modellist = [f for f in os.listdir(LOGDIR) if f.startswith("history")]
    modellist.sort()
    if len(modellist) > 0:
        filename = os.path.join(LOGDIR, modellist[-1])  # the latest best model
        if filename != self.best_model_filename:
            print("loading model: ", filename)
            self.best_model_filename = filename
            if self.best_model is not None:
                del self.best_model
            self.best_model = PPO1.load(filename, env=self)
    return super(SlimeVolleySelfPlayEnv, self).reset()
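# Sketch (an assumption, not taken from the source) of the callback side that produces the
# "history_*" files consumed by reset() above: an EvalCallback subclass that snapshots the
# current model into LOGDIR whenever the evaluated mean reward clears a threshold, so the
# environment picks it up as its next opponent. BEST_THRESHOLD, the zero-padded naming, and
# LOGDIR (taken to be the same directory as in reset()) are illustrative choices.
import os
from stable_baselines.common.callbacks import EvalCallback

BEST_THRESHOLD = 0.5  # hypothetical score an agent must reach to become the new opponent


class SelfPlayCallback(EvalCallback):
    def __init__(self, *args, **kwargs):
        super(SelfPlayCallback, self).__init__(*args, **kwargs)
        self.generation = 0

    def _on_step(self):
        result = super(SelfPlayCallback, self)._on_step()
        if result and self.best_mean_reward > BEST_THRESHOLD:
            # save a numbered checkpoint that the self-play env will load as its next opponent
            self.generation += 1
            name = os.path.join(LOGDIR, "history_" + str(self.generation).zfill(8) + ".zip")
            self.model.save(name)
            self.best_mean_reward = -float("inf")  # the next generation must beat the threshold again
        return result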
def reset(self):
    # Load the opponent model if it's there, else wait (meant to be run in parallel with opponent training).
    # reset() is run many times throughout the experiment, not just during callbacks.
    while True:
        opp_modellist = [f for f in os.listdir(OPP_LOGDIR) if f.startswith("history")]
        opp_modellist.sort()
        self_modellist = [f for f in os.listdir(SELF_LOGDIR) if f.startswith("history")]
        self_modellist.sort()

        # Experiment just started, so no history files yet
        if len(self_modellist) == 0:
            return super(SlimeVolleyMultiAgentEnv, self).reset()

        # Middle of the experiment
        if len(self_modellist) > 0:
            # Same number of history files on both sides: check the opponent's last generation.
            if len(self_modellist) - len(opp_modellist) == 0:
                opp_filename = opp_modellist[-1]
                # Opponent's last generation is unchanged -> both models are still training the same generation.
                if opp_filename == self.opp_model_filename:
                    return super(SlimeVolleyMultiAgentEnv, self).reset()
                # Opponent's last generation changed -> the opponent has been waiting -> load the new opponent.
                elif opp_filename != self.opp_model_filename:
                    print("Loading model:", opp_filename)
                    self.opp_model_filename = opp_filename
                    if self.opp_model is not None:
                        del self.opp_model
                    self.opp_model = PPO1.load(os.path.join(OPP_LOGDIR, opp_filename), env=self)
                    return super(SlimeVolleyMultiAgentEnv, self).reset()
            # Opponent has finished its current generation, so self should continue training.
            elif len(opp_modellist) - len(self_modellist) == 1:
                print(f"Self: Gen {len(self_modellist)}, Opp: Gen {len(opp_modellist)}. "
                      f"Opponent waiting for self training to complete.")
                return super(SlimeVolleyMultiAgentEnv, self).reset()

        print(f"Self: Gen {len(self_modellist)}, Opp: Gen {len(opp_modellist)}. "
              f"Waiting for opponent training to complete.")
        time.sleep(5)
def train(env_id, num_timesteps, seed, algorithm, model_save_file=None, log_dir=None):
    with tf_util.single_threaded_session():
        logger.configure(folder=log_dir, format_strs=['stdout', 'log', 'csv'])
        workerseed = seed + MPI.COMM_WORLD.Get_rank()
        env = make_mujoco_env(env_id, workerseed)
        if algorithm == "TRPO":
            model = TRPO(MlpPolicy, env, seed=workerseed, verbose=1)
        else:
            # Algorithm is PPO
            model = PPO1(MlpPolicy, env, seed=workerseed, verbose=1)
        model.learn(total_timesteps=num_timesteps)
        if model_save_file is not None:
            model.save(model_save_file)
        env.close()
def train():
    # train selfplay agent
    logger.configure(folder=LOGDIR)

    env = SlimeVolleySelfPlayEnv()
    env.seed(SEED)

    model = PPO1.load(BEST_MODEL_PATH, env=env)

    eval_callback = SelfPlayCallback(env,
                                     best_model_save_path=LOGDIR,
                                     log_path=LOGDIR,
                                     eval_freq=EVAL_FREQ,
                                     n_eval_episodes=EVAL_EPISODES,
                                     deterministic=False)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point.

    env.close()
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Atari environments, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = PPO1(CnnPolicy, env,
                 timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01,
                 optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                 gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    model.learn(total_timesteps=num_timesteps)
    env.close()
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for the Mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    env = make_mujoco_env(env_id, seed)
    model = PPO1(MlpPolicy, env,
                 timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0,
                 optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                 gamma=0.99, lam=0.95, schedule='linear')
    model.learn(total_timesteps=num_timesteps)
    env.close()
def train(num_timesteps, seed, model_path=None):
    """
    Train PPO1 model for the Humanoid environment, for testing purposes

    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param model_path: (str) path to the model
    """
    env_id = 'Humanoid-v2'
    env = gym.make("RoboschoolHumanoid-v1")

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    model = PPO1(MlpPolicy, env,
                 timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0,
                 optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                 gamma=0.99, lam=0.95, schedule='linear')
    model.learn(total_timesteps=num_timesteps)
    env.close()

    if model_path:
        tf_util.save_state(model_path)

    return model
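# Reconstruction of the RewScale wrapper used above (it is defined elsewhere in the original
# script; this sketch assumes it is a plain gym.RewardWrapper that multiplies every reward by
# a constant scale factor).
import gym


class RewScale(gym.RewardWrapper):
    def __init__(self, env, scale):
        super(RewScale, self).__init__(env)
        self.scale = scale

    def reward(self, _reward):
        # scale the raw environment reward; PPO is sensitive to reward magnitude
        return _reward * self.scale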
def main(args):
    rank = MPI.COMM_WORLD.Get_rank()

    model_dir = os.path.join(config.MODELDIR, args.env_name)

    if rank == 0:
        try:
            os.makedirs(model_dir)
        except:
            pass
        if args.reset:
            reset_files(model_dir)
        logger.configure(config.LOGDIR)
    else:
        logger.configure(format_strs=[])

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        time.sleep(5)
        logger.set_level(config.INFO)

    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    logger.info('\nSetting up the selfplay training environment opponents...')
    base_env = get_environment(args.env_name)
    env = selfplay_wrapper(base_env)(opponent_type=args.opponent_type, verbose=args.verbose)
    env.seed(workerseed)

    CustomPolicy = get_network_arch(args.env_name)

    params = {
        'gamma': args.gamma,
        'timesteps_per_actorbatch': args.timesteps_per_actorbatch,
        'clip_param': args.clip_param,
        'entcoeff': args.entcoeff,
        'optim_epochs': args.optim_epochs,
        'optim_stepsize': args.optim_stepsize,
        'optim_batchsize': args.optim_batchsize,
        'lam': args.lam,
        'adam_epsilon': args.adam_epsilon,
        'schedule': 'linear',
        'verbose': 1,
        'tensorboard_log': config.LOGDIR,
    }

    # allow time for the base model to be saved out when the environment is created
    time.sleep(5)

    if args.reset or not os.path.exists(os.path.join(model_dir, 'best_model.zip')):
        logger.info('\nLoading the base PPO agent to train...')
        model = PPO1.load(os.path.join(model_dir, 'base.zip'), env, **params)
    else:
        logger.info('\nLoading the best_model.zip PPO agent to continue training...')
        model = PPO1.load(os.path.join(model_dir, 'best_model.zip'), env, **params)

    # Callbacks
    logger.info('\nSetting up the selfplay evaluation environment opponents...')
    callback_args = {
        'eval_env': selfplay_wrapper(base_env)(opponent_type=args.opponent_type, verbose=args.verbose),
        'best_model_save_path': config.TMPMODELDIR,
        'log_path': config.LOGDIR,
        'eval_freq': args.eval_freq,
        'n_eval_episodes': args.n_eval_episodes,
        'deterministic': False,
        'render': True,
        'verbose': 0,
    }

    if args.rules:
        logger.info('\nSetting up the evaluation environment against the rules-based agent...')
        # Evaluate against a 'rules' agent as well
        eval_actual_callback = EvalCallback(
            eval_env=selfplay_wrapper(base_env)(opponent_type='rules', verbose=args.verbose),
            eval_freq=1,
            n_eval_episodes=args.n_eval_episodes,
            deterministic=args.best,
            render=True,
            verbose=0)
        callback_args['callback_on_new_best'] = eval_actual_callback

    # Evaluate the agent against previous versions
    eval_callback = SelfPlayCallback(args.opponent_type, args.threshold, args.env_name, **callback_args)

    logger.info('\nSetup complete - commencing learning...\n')

    model.learn(total_timesteps=int(1e9), callback=[eval_callback], reset_num_timesteps=False, tb_log_name="tb")

    env.close()
    del env
import pytest

from stable_baselines.a2c import A2C
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common.identity_env import IdentityEnvMultiBinary, IdentityEnvMultiDiscrete
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy

MODEL_FUNC_LIST = [
    lambda e: A2C(policy=MlpPolicy, env=e),
    lambda e: PPO1(policy=MlpPolicy, env=e),
    lambda e: PPO2(policy=MlpPolicy, env=e),
    lambda e: TRPO(policy=MlpPolicy, env=e),
]


@pytest.mark.slow
@pytest.mark.parametrize("model_func", MODEL_FUNC_LIST)
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy) can learn an identity transformation
    (i.e. return observation as an action) with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])

    model = model_func(env)
RENDER_MODE = False

SELF_LOGDIR = "exp/multi/ppo-dnn-mujoco"
OPP_LOGDIR = "exp/multi/ppo-bnn-mujoco"

logger.configure(folder=SELF_LOGDIR)

env = SlimeVolleyMultiAgentEnv()
env.seed(SEED)

# take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps)
model = PPO1(MlpPolicy, env,
             timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0,
             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
             gamma=0.99, lam=0.95, schedule='linear', verbose=2)

eval_callback = MultiAgentCallback(env,
                                   best_model_save_path=SELF_LOGDIR,
                                   log_path=SELF_LOGDIR,
                                   eval_freq=EVAL_FREQ,
                                   n_eval_episodes=EVAL_EPISODES,
                                   deterministic=False)

model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

model.save(os.path.join(SELF_LOGDIR, "final_model"))
learn_func_list = [
    lambda e: A2C(policy=MlpPolicy, learning_rate=1e-3, n_steps=1,
                  gamma=0.7, env=e).learn(total_timesteps=10000, seed=0),
    lambda e: ACER(policy=MlpPolicy, env=e,
                   n_steps=1, replay_ratio=1).learn(total_timesteps=10000, seed=0),
    lambda e: ACKTR(policy=MlpPolicy, env=e,
                    learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000, seed=0),
    lambda e: DeepQ(policy=deepq_models.mlp([32]), batch_size=16, gamma=0.1,
                    exploration_fraction=0.001, env=e).learn(total_timesteps=40000, seed=0),
    lambda e: PPO1(policy=MlpPolicy, env=e, lam=0.7,
                   optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=10000, seed=0),
    lambda e: PPO2(policy=MlpPolicy, env=e,
                   learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000, seed=0),
    lambda e: TRPO(policy=MlpPolicy, env=e,
                   max_kl=0.05, lam=0.7).learn(total_timesteps=10000, seed=0),
]


@pytest.mark.slow
@pytest.mark.parametrize("learn_func", learn_func_list)
def test_identity(learn_func):
    """
    Test if the algorithm (with a given policy) can learn an identity transformation
    (i.e. return observation as an action)
LOGDIR = args.logdir
timesteps_per_actorbatch = args.batchnum
optim_stepsize = args.optim_stepsize
gamma = args.gamma

print("***********")
print("Logging to " + LOGDIR)
logger.configure(folder=LOGDIR)

train_env = gym.make("TankGymTrain-v0")
train_env.seed(SEED)
train_env.policy = tankgym.BaselineRandWAim()

eval_env = gym.make("TankGym-v0")
eval_env.seed(EVAL_SEED)
eval_env.policy = tankgym.BaselineRandWAim()

# take mujoco hyperparams (but 2x timesteps_per_actorbatch to cover more steps)
model = PPO1(MlpPolicy, train_env,
             timesteps_per_actorbatch=timesteps_per_actorbatch, clip_param=0.2, entcoeff=0.0,
             optim_epochs=10, optim_stepsize=optim_stepsize, optim_batchsize=64,
             gamma=gamma, lam=0.95, schedule='linear', verbose=2)

eval_callback = EvalCallback(eval_env,
                             best_model_save_path=LOGDIR,
                             log_path=LOGDIR,
                             eval_freq=EVAL_FREQ,
                             n_eval_episodes=EVAL_EPISODES)

model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point.

train_env.close()
eval_env.close()