def __init__(self): """ Do any initial setup here """ # self.model = PPO1(CnnPolicy, Env(), timesteps_per_actorbatch=128, clip_param=0.2, entcoeff=0.01,optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear',verbose=1) path = pathlib.Path(__file__).resolve().parent print("Loading ...", str(path) + '/ppo_save') self.model = PPO1.load(str(path) + '/ppo_save')
def load_model(env, name=None):
    if name:
        filename = os.path.join(config.MODELDIR, env.name, name)
        if os.path.exists(filename):
            logger.info(f'Loading {name}')
            cont = True
            while cont:
                try:
                    # Retry until the checkpoint can be read (another process may still be writing it)
                    ppo_model = PPO1.load(filename, env=env)
                    cont = False
                except Exception as e:
                    time.sleep(5)
                    print(e)
        else:
            raise Exception(f'\n{filename} not found')
    else:
        logger.info('Loading base PPO model')
        cont = True
        while cont:
            try:
                ppo_model = PPO1(get_network_arch(env.name), env=env)
                cont = False
            except Exception as e:
                time.sleep(5)
                print(e)
    return ppo_model
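# --- Illustrative usage (not from the original source) ---
# A minimal sketch of how load_model() above might be called; `env` is assumed
# to be a gym-style environment exposing the .name attribute the code expects,
# and the checkpoint name is an assumption.
best_model = load_model(env, name='best_model.zip')  # load a named checkpoint
base_model = load_model(env)                         # build a fresh base PPO1 model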
def __init__(self):
    super(TankSelfPlayEnv, self).__init__()
    self.policy = self
    self.best_model = None
    self.best_model_filename = None
    self.oppo_model = None
    if USE_STRONG_OPP:
        print("Using strong opp as a start point")
        self.oppo_model = PPO1.load("ppo_small_stepsize/best_model.zip")
def load_model(env, name):
    filename = os.path.join(config.MODELDIR, env.name, name)
    if os.path.exists(filename):
        logger.info(f'Loading {name}')
        cont = True
        while cont:
            try:
                ppo_model = PPO1.load(filename, env=env)
                cont = False
            except Exception as e:
                time.sleep(5)
                print(e)
    elif name == 'base.zip':
        cont = True
        while cont:
            try:
                rank = MPI.COMM_WORLD.Get_rank()
                if rank == 0:
                    # Only rank 0 creates and saves the base model; other ranks wait for it.
                    ppo_model = PPO1(get_network_arch(env.name), env=env)
                    logger.info('Saving base.zip PPO model...')
                    ppo_model.save(os.path.join(config.MODELDIR, env.name, 'base.zip'))
                else:
                    ppo_model = PPO1.load(os.path.join(config.MODELDIR, env.name, 'base.zip'), env=env)
                cont = False
            except IOError as e:
                sys.exit(f'Permissions not granted on zoo/{env.name}/...')
            except Exception as e:
                print('Waiting for base.zip to be created...', e)
                time.sleep(2)
    else:
        raise Exception(f'\n{filename} not found')
    return ppo_model
def reset(self):
    # load model if it's there
    modellist = [f for f in os.listdir(LOGDIR) if f.startswith("history")]
    modellist.sort()
    if len(modellist) > 0:
        filename = os.path.join(LOGDIR, modellist[-1])  # the latest best model
        if filename != self.best_model_filename:
            print("loading model: ", filename)
            self.best_model_filename = filename
            if self.best_model is not None:
                del self.best_model
            self.best_model = PPO1.load(filename, env=self)
    return super(SlimeVolleySelfPlayEnv, self).reset()
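# --- Illustrative sketch (not from the original source) ---
# A self-play env like the one above typically consumes the loaded opponent
# checkpoint through a predict() helper when stepping; the names below
# (self.best_model, self.action_space) follow the snippet above, but this
# method itself is an assumption, not the author's code.
def predict(self, obs):
    # Fall back to a random move until a "history*" checkpoint has been loaded.
    if self.best_model is None:
        return self.action_space.sample()
    action, _ = self.best_model.predict(obs, deterministic=False)
    return action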
def reset(self):
    # Load model if it's there, else wait (meant to be used in parallel with opponent training).
    # reset() is run multiple times throughout the experiment, not just during callbacks.
    while True:
        opp_modellist = [f for f in os.listdir(OPP_LOGDIR) if f.startswith("history")]
        opp_modellist.sort()
        self_modellist = [f for f in os.listdir(SELF_LOGDIR) if f.startswith("history")]
        self_modellist.sort()

        # Experiment just started, so there are no history files yet.
        if len(self_modellist) == 0:
            return super(SlimeVolleyMultiAgentEnv, self).reset()

        # Middle of the experiment.
        if len(self_modellist) > 0:
            # If the number of history files is the same, check the opponent's last generation.
            if len(self_modellist) - len(opp_modellist) == 0:
                opp_filename = opp_modellist[-1]
                # Opponent's last generation is unchanged -> both models are still training the same generation.
                if opp_filename == self.opp_model_filename:
                    return super(SlimeVolleyMultiAgentEnv, self).reset()
                # Opponent's last generation changed -> the opponent has been waiting -> load the new opponent.
                elif opp_filename != self.opp_model_filename:
                    print("Loading model:", opp_filename)
                    self.opp_model_filename = opp_filename
                    if self.opp_model is not None:
                        del self.opp_model
                    self.opp_model = PPO1.load(os.path.join(OPP_LOGDIR, opp_filename), env=self)
                    return super(SlimeVolleyMultiAgentEnv, self).reset()
            # The opponent has finished its current generation of training, so self should continue training.
            elif len(opp_modellist) - len(self_modellist) == 1:
                print(f"Self: Gen {len(self_modellist)}, Opp: Gen {len(opp_modellist)}. Opponent waiting for self training to complete.")
                return super(SlimeVolleyMultiAgentEnv, self).reset()

        print(f"Self: Gen {len(self_modellist)}, Opp: Gen {len(opp_modellist)}. Waiting for opponent training to complete.")
        time.sleep(5)
def main(): """ Runs the test """ args = mujoco_arg_parser().parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) env = make_mujoco_env(args.env, args.seed) model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') model.learn(total_timesteps=args.num_timesteps) model.save("ppo1") # env.close() del model # remove to demonstrate saving and loading # env = make_mujoco_env(args.env, args.seed) model = PPO1.load("ppo1") logger.log("~!!!!!!!!") episode_rew = 0 obs = env.reset() while True: action, _states = model.predict(obs) ob, reward, done, info = env.step(action) episode_rew += reward env.render() if done: print(f'episode_rew={episode_rew}') episode_rew = 0 obs = env.reset()
def train():
    # train selfplay agent
    logger.configure(folder=LOGDIR)

    env = SlimeVolleySelfPlayEnv()
    env.seed(SEED)

    model = PPO1.load(BEST_MODEL_PATH, env=env)

    eval_callback = SelfPlayCallback(env,
                                     best_model_save_path=LOGDIR,
                                     log_path=LOGDIR,
                                     eval_freq=EVAL_FREQ,
                                     n_eval_episodes=EVAL_EPISODES,
                                     deterministic=False)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point.

    env.close()
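# --- Illustrative sketch (not from the original source) ---
# The SelfPlayCallback used above is assumed to extend stable_baselines'
# EvalCallback and to archive a "history_*" checkpoint whenever the evaluated
# reward clears a threshold, so the self-play env's reset() (shown earlier)
# can pick up the newest opponent. BEST_THRESHOLD and the file-naming scheme
# are assumptions, not the author's code; LOGDIR is the constant used above.
from stable_baselines.common.callbacks import EvalCallback

class SelfPlayCallback(EvalCallback):
    def __init__(self, *args, **kwargs):
        super(SelfPlayCallback, self).__init__(*args, **kwargs)
        # Start the bar at the threshold so only genuinely strong models are archived.
        self.best_mean_reward = BEST_THRESHOLD
        self.generation = 0

    def _on_step(self):
        result = super(SelfPlayCallback, self)._on_step()
        if result and self.best_mean_reward > BEST_THRESHOLD:
            self.generation += 1
            # Archive the current model as the next "history_*" opponent checkpoint.
            self.model.save(os.path.join(LOGDIR, "history_%08d" % self.generation))
            # Reset the bar so the next generation must clear the threshold again.
            self.best_mean_reward = BEST_THRESHOLD
        return result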
def main(args):
    rank = MPI.COMM_WORLD.Get_rank()

    model_dir = os.path.join(config.MODELDIR, args.env_name)

    if rank == 0:
        try:
            os.makedirs(model_dir)
        except:
            pass
        if args.reset:
            reset_files(model_dir)
        logger.configure(config.LOGDIR)
    else:
        logger.configure(format_strs=[])

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        time.sleep(5)
        logger.set_level(config.INFO)

    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    logger.info('\nSetting up the selfplay training environment opponents...')
    base_env = get_environment(args.env_name)
    env = selfplay_wrapper(base_env)(opponent_type=args.opponent_type,
                                     verbose=args.verbose)
    env.seed(workerseed)

    CustomPolicy = get_network_arch(args.env_name)

    params = {
        'gamma': args.gamma,
        'timesteps_per_actorbatch': args.timesteps_per_actorbatch,
        'clip_param': args.clip_param,
        'entcoeff': args.entcoeff,
        'optim_epochs': args.optim_epochs,
        'optim_stepsize': args.optim_stepsize,
        'optim_batchsize': args.optim_batchsize,
        'lam': args.lam,
        'adam_epsilon': args.adam_epsilon,
        'schedule': 'linear',
        'verbose': 1,
        'tensorboard_log': config.LOGDIR
    }

    time.sleep(5)  # allow time for the base model to be saved out when the environment is created

    if args.reset or not os.path.exists(os.path.join(model_dir, 'best_model.zip')):
        logger.info('\nLoading the base PPO agent to train...')
        model = PPO1.load(os.path.join(model_dir, 'base.zip'), env, **params)
    else:
        logger.info('\nLoading the best_model.zip PPO agent to continue training...')
        model = PPO1.load(os.path.join(model_dir, 'best_model.zip'), env, **params)

    # Callbacks
    logger.info('\nSetting up the selfplay evaluation environment opponents...')
    callback_args = {
        'eval_env': selfplay_wrapper(base_env)(opponent_type=args.opponent_type,
                                               verbose=args.verbose),
        'best_model_save_path': config.TMPMODELDIR,
        'log_path': config.LOGDIR,
        'eval_freq': args.eval_freq,
        'n_eval_episodes': args.n_eval_episodes,
        'deterministic': False,
        'render': True,
        'verbose': 0
    }

    if args.rules:
        logger.info('\nSetting up the evaluation environment against the rules-based agent...')
        # Evaluate against a 'rules' agent as well
        eval_actual_callback = EvalCallback(
            eval_env=selfplay_wrapper(base_env)(opponent_type='rules',
                                                verbose=args.verbose),
            eval_freq=1,
            n_eval_episodes=args.n_eval_episodes,
            deterministic=args.best,
            render=True,
            verbose=0)
        callback_args['callback_on_new_best'] = eval_actual_callback

    # Evaluate the agent against previous versions of itself
    eval_callback = SelfPlayCallback(args.opponent_type, args.threshold,
                                     args.env_name, **callback_args)

    logger.info('\nSetup complete - commencing learning...\n')

    model.learn(total_timesteps=int(1e9),
                callback=[eval_callback],
                reset_num_timesteps=False,
                tb_log_name="tb")

    env.close()
    del env