def test_recurrent_eval_callback():
    env_id = 'Pendulum-v0'
    # Create envs
    env = make_vec_env(env_id, n_envs=4)
    eval_env = make_vec_env(env_id, n_envs=1)
    # Create RL model
    model = PPO2('MlpLstmPolicy', env)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)
    eval_callback = EvalCallback(eval_env,
                                 callback_on_new_best=callback_on_best,
                                 best_model_save_path=LOG_FOLDER,
                                 log_path=LOG_FOLDER,
                                 eval_freq=100)
    model.learn(300, callback=eval_callback)
    # Cleanup
    if os.path.exists(LOG_FOLDER):
        shutil.rmtree(LOG_FOLDER)
def create_model(self, n_envs=1):
    """
    Create env and agent model
    """
    env_cls = SprEnv
    self.env = make_vec_env(env_cls,
                            n_envs=n_envs,
                            env_kwargs={"params": self.params},
                            seed=self.params.seed)
    self.model = ACKTR(
        self.policy,
        self.env,
        gamma=self.params.agent_config['gamma'],
        n_steps=self.params.agent_config['n_steps'],
        ent_coef=self.params.agent_config['ent_coef'],
        vf_coef=self.params.agent_config['vf_coef'],
        vf_fisher_coef=self.params.agent_config['vf_fisher_coef'],
        max_grad_norm=self.params.agent_config['max_grad_norm'],
        learning_rate=self.params.agent_config['learning_rate'],
        gae_lambda=self.params.agent_config['gae_lambda'],
        lr_schedule=self.params.agent_config['lr_schedule'],
        kfac_clip=self.params.agent_config['kfac_clip'],
        kfac_update=self.params.agent_config['kfac_update'],
        async_eigen_decomp=self.params.agent_config['async_eigen_decomp'],
        verbose=self.params.agent_config['verbose'],
        tensorboard_log="./tb/acktr/",
        seed=self.params.seed,
        policy_kwargs={"params": self.params})
def get_intrinsic_reward(base_index):
    intrinsic_rewards = [[] for _ in range(len(subenv_dict))]
    # base env
    base_name = subenv_dict[base_index]
    base_env = make_vec_env(f"selected-bipedal-{base_name}-v0", n_envs=1, seed=seed)
    base_agent = ACKTR.load(f"./base_agent/{base_name}/model.zip")
    # rnd model
    rnd_dict = {}
    for client_env in subenv_dict.values():
        rnd = RandomNetworkDistillation(input_size=24)
        rnd.load(f"./base{base_index}_client_model/{client_env}/rnd")
        rnd_dict[client_env] = rnd
    obs = base_env.reset()
    for _ in range(num_test):
        for i, client_env in subenv_dict.items():
            intrinsic_rewards[i].append(
                rnd_dict[client_env].get_intrinsic_reward(obs))
        action = base_agent.predict(obs)
        obs, reward, done, info = base_env.step(action[0])
        if done:
            obs = base_env.reset()
    return intrinsic_rewards
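# The snippet above assumes a RandomNetworkDistillation class with load() and
# get_intrinsic_reward() methods. Below is a minimal sketch of that interface
# in PyTorch (an assumption for illustration -- the real class may differ in
# layer sizes, optimizer, and on-disk format): the intrinsic reward is the
# prediction error of a trained network against a fixed, randomly initialized
# target network, which is large on states the predictor has rarely seen.
import torch
import torch.nn as nn


class RandomNetworkDistillation:
    def __init__(self, input_size, hidden_size=64, lr=1e-4):
        def mlp():
            return nn.Sequential(nn.Linear(input_size, hidden_size), nn.ReLU(),
                                 nn.Linear(hidden_size, hidden_size))
        self.target = mlp()       # fixed random network, never trained
        self.predictor = mlp()    # trained to imitate the target
        for p in self.target.parameters():
            p.requires_grad = False
        self.optimizer = torch.optim.Adam(self.predictor.parameters(), lr=lr)

    def get_intrinsic_reward(self, obs):
        obs = torch.as_tensor(obs, dtype=torch.float32)
        with torch.no_grad():
            # mean squared prediction error = exploration bonus
            return (self.predictor(obs) - self.target(obs)).pow(2).mean().item()

    def update(self, obs):
        # one gradient step on the distillation loss
        obs = torch.as_tensor(obs, dtype=torch.float32)
        loss = (self.predictor(obs) - self.target(obs)).pow(2).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def load(self, path):
        # assumed on-disk naming; the original project may store this differently
        self.predictor.load_state_dict(torch.load(f"{path}_predictor.pt"))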
def main_exp(arg):
    env = DMP_simulator_3d_dynamic_triangle.deep_mobile_printing_3d1r(
        plan_choose=arg["plan_choose"])
    env = make_vec_env(lambda: env, n_envs=1)
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[512, 512, 512])
    model = PPO2(MlpPolicy,
                 env,
                 policy_kwargs=policy_kwargs,
                 gamma=arg["gamma"],
                 n_steps=arg["n_steps"],
                 noptepochs=arg["noptepochs"],
                 ent_coef=arg["ent_coef"],
                 learning_rate=arg["learning_rate"],
                 vf_coef=arg["vf_coef"],
                 cliprange=arg["cliprange"],
                 nminibatches=arg["nminibatches"],
                 verbose=1,
                 tensorboard_log=arg["tensorboard_log"],
                 n_cpu_tf_sess=1,
                 seed=arg["seed"])
    time_steps = 1e7
    model.learn(total_timesteps=int(time_steps), tb_log_name=arg["tb_log_name"])
    model.save(arg["model_save_path"])
    return model
def make_alrs_env(args, test=False, baseline=False):
    """
    Make a new ALRS environment with parameters specified as command line arguments.
    """
    from environment import AdaptiveLearningRateOptimizer
    env = make_vec_env(
        env_id=AdaptiveLearningRateOptimizer,
        n_envs=1 if test else args.num_envs,
        env_kwargs={
            'dataset': args.dataset,
            'architecture': args.architecture,
            'batch_size': args.batch_size,
            'update_freq': args.update_freq,
            'num_train_steps': args.num_train_steps,
            'initial_lr': args.initial_lr,
            'discrete': args.discrete,
            'action_range': np.inf if baseline else args.action_range,
            'lr_noise': not (test or baseline)
        }
    )
    env = VecNormalize(
        venv=env,
        norm_obs=args.ppo2_norm_obs,
        norm_reward=args.ppo2_norm_reward,
        clip_obs=args.ppo2_cliprange if args.ppo2_cliprange > 0 else 10,
        clip_reward=args.ppo2_cliprange if args.ppo2_cliprange > 0 else 10,
        gamma=args.ppo2_gamma
    )
    env.alrs = env.venv.envs[0].env
    return env
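# Usage note for the wrapper above: VecNormalize keeps running mean/std
# statistics that are not stored inside the model zip. When saving a model
# trained in this env, persist them as well (API as in stable-baselines 2.x):
#
#   env.save_running_average(save_dir)    # alongside model.save(...)
#   env.load_running_average(save_dir)    # before evaluating a reloaded model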
def create_env(n_envs, env_name=None, log_dir=None):
    return VecNormalize(make_vec_env(ENVS[env_name][env_id],
                                     n_envs=n_envs,
                                     env_kwargs=ENVS[env_name][env_kwargs],
                                     monitor_dir=log_dir),
                        norm_obs=False,
                        norm_reward=True)
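# Hypothetical usage of create_env, assuming ENVS is a registry keyed by a
# friendly name, with env_id and env_kwargs as module-level key constants
# (the names and values below are illustrative, not from the original project):
#
#   ENVS = {"reacher": {env_id: "Reacher-v2", env_kwargs: {}}}
#   env = create_env(n_envs=4, env_name="reacher", log_dir="./logs/reacher")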
def run_stable_baselines(
    reward_config_file,
    hysr_one_ball_config_file,
    ppo_config_file,
    log_episodes=False,
    log_tensorboard=False,
):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common import make_vec_env
    from stable_baselines import PPO2

    env_config = {
        "reward_config_file": reward_config_file,
        "hysr_one_ball_config_file": hysr_one_ball_config_file,
        "log_episodes": log_episodes,
        "log_tensorboard": log_tensorboard,
    }
    env = make_vec_env(HysrOneBallEnv, env_kwargs=env_config)
    ppo_config = PPOConfig.from_json(ppo_config_file)
    if log_tensorboard:
        # PPO2 takes a log *directory* via `tensorboard_log`, not a boolean
        # flag (the path below is a placeholder)
        model = PPO2(MlpPolicy, env, verbose=1,
                     tensorboard_log="./tb/ppo2/", **ppo_config)
    else:
        model = PPO2(MlpPolicy, env, verbose=1, **ppo_config)
    model.learn(total_timesteps=1000000)
    model.save("ppo2_hysr_one_ball")
def train(timesteps=TIMESTEPS):
    print(
        f"[INFO] STARTING TRAINING: {START_TIME} {ENVIRONMENT}-{POLICY_NAME}-{ALGO}"
    )
    print(f"[INFO] NETWORK ARCH {NETWORK_ARCH}")
    # use vectorized environments for the appropriate algorithms for a speed boost
    env = make_vec_env(ENVIRONMENT, NUM_ENVS)
    # the network architecture can be defined above for any policy
    policy_kwargs = dict(net_arch=NETWORK_ARCH)
    model = PPO2(policy=POLICY,
                 env=env,
                 verbose=0,
                 policy_kwargs=policy_kwargs,
                 tensorboard_log=TENSORBOARD_DIR,
                 n_steps=1,
                 learning_rate=LEARNING_RATE)
    if LOAD_MODEL:
        # load() is a classmethod that returns a new model; rebind the result
        # instead of calling it on the instance (which discards the weights)
        model = PPO2.load(load_path=LOAD_DIR, env=env)
    print(f"[INFO] Training for TIMESTEPS {TIMESTEPS}")
    model.learn(total_timesteps=timesteps,
                log_interval=LOG_INTERVAL,
                tb_log_name=TB_LOG_NAME)  # experiment select
    print("[INFO] Done training")
    model.save(save_path=MODEL_DIR, cloudpickle=False)
    print(f"[INFO] MODEL SAVED TO {MODEL_DIR}")
    return 0
def fed_and_eval(base_index, w):
    base_env = make_vec_env(f"selected-bipedal-{subenv_dict[base_index]}-v0",
                            n_envs=1, seed=seed)
    base_agent = ACKTR.load(
        f"./base_agent/{subenv_dict[base_index]}/model.zip")
    sub_model_parameters = []
    for subenv in subenv_dict.values():
        client_policy = ACKTR.load(
            f"./base{base_index}_client_model/{subenv}/policy.zip")
        sub_model_parameters.append(client_policy.get_parameters())
    aligned_agent = base_agent
    base_parameter_dict = aligned_agent.get_parameters()
    model_align(w, base_parameter_dict, sub_model_parameters, alpha=alpha)
    aligned_agent.load_parameters(base_parameter_dict)
    avg_reward, reward_std = evaluate_policy(aligned_agent, base_env,
                                             n_eval_episodes=100)
    print(f"base {base_index}, weight {w} done")
    return (avg_reward, reward_std)
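# model_align is not shown above. A hedged sketch of what it is assumed to do,
# FedAvg-style: blend the base agent's parameter dict in place toward a
# weighted average of the client parameters. The per-client weights `w` and
# the interpolation strength `alpha` match the call site; everything else is
# an illustrative assumption.
def model_align(w, base_parameter_dict, sub_model_parameters, alpha=0.5):
    total = sum(w)
    for name in base_parameter_dict:
        # weighted average of the clients' copies of this parameter tensor
        client_avg = sum(
            w_i * params[name]
            for w_i, params in zip(w, sub_model_parameters)) / total
        # move the base parameters part of the way toward the client average
        base_parameter_dict[name] = (
            (1 - alpha) * base_parameter_dict[name] + alpha * client_avg)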
def test(args):
    print("testing the trained environment")
    env_info = {
        "args": args,
        "external_func": ext_func_list,
        "params": request_params
    }
    env = make_vec_env(tradingEnv.TradingEnvironment,
                       n_envs=args.num_envs,
                       env_kwargs={"env_info": env_info})

    # Constants for saving logs and models
    exp_name = args.exp_name
    save_dir = os.path.join(BASE_PATH, 'logs_models', exp_name)
    model_name = os.path.join(save_dir, "Trading_exp_1_finished")
    model = PPO2.load(model_name)

    obs = env.reset()
    # Test for n steps
    for i in range(1000):
        # test_model_load.start_innvestigate(new_model, obs)
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        # `dones` is an array with one flag per sub-env, so test it with any()
        if dones.any():
            print("RESET")
        if args.visualize:
            env.render()
def createVectorizedEnv():
    # Wait for the scripts to start.
    # Traffic flows out of H1 and into H2
    env = LoadBalanceEnvDiscAction(source_port_index=0,
                                   source_switch_index=0,
                                   target_port_index=0,
                                   target_switch_index=2)
    env = make_vec_env(lambda: env, n_envs=1)
    return env
def run_stable(num_steps, save_dir):
    env = make_vec_env(BBall3Env, n_envs=1, monitor_dir=save_dir,
                       env_kwargs=env_config)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.5 * np.ones(n_actions))
    model = TD3(
        MlpPolicy,
        env,
        action_noise=action_noise,
        verbose=1,
        gamma=0.99,
        buffer_size=1000000,
        learning_starts=10000,
        batch_size=100,
        learning_rate=1e-3,
        train_freq=1000,
        gradient_steps=1000,
        policy_kwargs={"layers": [64, 64]},
        n_cpu_tf_sess=1,
    )
    num_epochs = 1
    total_steps = 5e5
    for epoch in range(num_epochs):
        model.learn(total_timesteps=int(total_steps / num_epochs))
        model.save(save_dir + "/model.zip")
def run_stable(num_steps, save_dir):
    env = make_vec_env(BBall3Env, n_envs=8, monitor_dir=save_dir,
                       env_kwargs=env_config)
    # env = VecNormalize(env)
    model = PPO2(
        MlpPolicy,
        env,
        verbose=1,
        seed=int(seed),
        n_steps=2048,
        nminibatches=32,
        lam=0.95,
        gamma=0.99,
        noptepochs=10,
        ent_coef=0.0,
        learning_rate=2.5e-4,
        cliprange=0.2,
        cliprange_vf=-1,
    )
    num_epochs = 5
    for epoch in range(num_epochs):
        model.learn(total_timesteps=int(num_steps / num_epochs))
        model.save(save_dir + "/model.zip")
def retrain(self, env, timesteps, modelpath, tensorboard_logs_path):
    model = self.load(modelpath, tensorboard_logs_path)
    env_str = self.get_env_str(env)
    model.set_env(make_vec_env(env_str, n_envs=8))
    # Continue training
    model.learn(total_timesteps=timesteps,
                reset_num_timesteps=False,
                tb_log_name="TB_" + datetime.now().strftime('%d%m%y-%H%M'))
    model.save(modelpath)
    return model
def __init__(self, env, output, train_steps, total_time_steps, session,
             paddle_length_factor=None, paddle_speed_factor=None,
             ball_speed_factor=None):
    """
    The constructor of the class 'JuPong2D_PPO2' creates a vectorized Gym
    environment with a specific parameter set. The neural networks are saved
    to the output folder after every 'total_time_steps' steps. For more
    accurate training, the 'session' parameter is used to train the same
    model multiple times.

    :param env: The Gym environment to load
    :param output: The output folder for the neural networks
    :param total_time_steps: Training duration before saving
    :param session: Session ID for a specific training configuration
    :param paddle_length_factor: Factor for the paddle length
    :param paddle_speed_factor: Factor for the paddle speed
    :param ball_speed_factor: Factor for the ball speed
    """
    self.train_steps = train_steps
    self.total_time_steps = total_time_steps
    self.env_name = env
    self.session = session
    self.env = make_vec_env(self.env_name, n_envs=4)
    self.output = output
    self.paddle_length_factor = paddle_length_factor
    self.paddle_speed_factor = paddle_speed_factor
    self.ball_speed_factor = ball_speed_factor
    self.save_name = "stablebl_ppo2_save"
    self.create_save_folder()
    self.make_save_path()
    self.create_model()
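# The constructor above calls self.create_model(), which is not shown. A
# minimal sketch of what it is assumed to do (PPO2 with the default MlpPolicy;
# the real project may pick a different policy or hyperparameters):
def create_model(self):
    from stable_baselines import PPO2
    # train on the vectorized JuPong2D env built in __init__
    self.model = PPO2('MlpPolicy', self.env, verbose=1)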
def train_with_forward_search(env_name, pop_size, total_timesteps,
                              train_timesteps, LOGS, FORWARD_SEARCH_MODEL,
                              args, seed):
    env = make_vec_env(env_name, n_envs=args.n_envs)
    model = PPO(MlpPolicy, env,
                n_steps=args.n_steps,
                nminibatches=args.nminibatches,
                noptepochs=args.noptepochs,
                ent_coef=args.ent_coef,
                learning_rate=args.learning_rate,
                lam=args.lam,
                gamma=args.gamma,
                cliprange=args.cliprange,
                cliprange_vf=args.cliprange_vf)
    # model = PPO(MlpPolicy, env, seed=seed)
    timesteps = []
    mean_reward = []
    std_reward = []
    # compute epochs before branching so it is defined on the resume path too
    epochs = total_timesteps // train_timesteps
    if os.path.exists(os.path.join(LOGS, 'forward_search_train_stats.npz')):
        train_stats = np.load(os.path.join(LOGS, 'forward_search_train_stats.npz'))
        pid = int(train_stats['pid'])
        completed_steps = int(train_stats['completed_steps'])
        model = PPO.load(FORWARD_SEARCH_MODEL, env=env, pid=pid)
        print("Loading forward search model with pid:{}, completed_steps:{}".format(pid, completed_steps))
    else:
        pid = os.getpid()
        completed_steps = 0
        model.save(FORWARD_SEARCH_MODEL, pid=pid)
    print("Running forward search with population size: {}, epochs: {}".format(pop_size, epochs))
    print("PID:{}".format(pid))
    for epoch in range(completed_steps // train_timesteps, epochs):
        with mp.get_context("spawn").Pool(pop_size) as pool:
            pooled_results = pool.starmap(
                forward_search,
                ((train_timesteps, env_name, args.n_envs, FORWARD_SEARCH_MODEL,
                  seed, pid) for _ in range(pop_size)))
        pooled_results = np.array(pooled_results)
        models_parameters = pooled_results[:, 0]
        process_ids = pooled_results[:, 1]
        mean_rewards = pooled_results[:, 2]
        std_rewards = pooled_results[:, 3]
        ind = np.argmax(mean_rewards)
        print("Epoch:{} Best child index from population: {}, Mean Reward:{}, Std Reward:{}".format(
            epoch + 1, ind, mean_rewards[ind], std_rewards[ind]))
        # adopt the best child as the new parent for the next epoch
        model = PPO.load(FORWARD_SEARCH_MODEL, env=env, pid=process_ids[ind])
        model.load_parameters(models_parameters[ind], exact_match=True)
        model.save(FORWARD_SEARCH_MODEL, pid=pid)
        timesteps.append((epoch + 1) * train_timesteps)
        mean_reward.append(mean_rewards[ind])
        std_reward.append(std_rewards[ind])
        plot_reward(np.array(timesteps), np.array(mean_reward),
                    np.array(mean_reward) - np.array(std_reward),
                    np.array(mean_reward) + np.array(std_reward),
                    figname=os.path.join(LOGS, 'fsepoch{}.png'.format(epoch + 1)))
        with open(os.path.join(LOGS, 'forward_search.csv'), 'a') as f:
            csvwriter = csv.writer(f, delimiter=',')
            csvwriter.writerow([epoch + 1, ind, mean_rewards[ind],
                                std_rewards[ind], pooled_results[:, 1:]])
        # keep the filename in sync with the resume check at the top
        np.savez_compressed(os.path.join(LOGS, 'forward_search_train_stats.npz'),
                            timesteps=timesteps,
                            mean_reward=mean_reward,
                            std_reward=std_reward,
                            pid=pid,
                            completed_steps=((epoch + 1) * train_timesteps))
def train():
    env = make_vec_env('My-CartPole-v0', n_envs=1)
    model = PPO2(MlpPolicy, env, verbose=0)
    n = 250000
    model.learn(total_timesteps=n)
    model.save("./weights/ppo2_cartpole" + str(n))
def parse_hyperparams(args):
    storage_name = f"sqlite:///tuning_studies/{args.study_name}.db"
    study = optuna.load_study(study_name=args.study_name, storage=storage_name)
    trial = study.best_trial
    params = trial.params

    # Make the environment depending on number of environments in params
    try:
        env = make_vec_env(
            lambda: gym.make(args.env, **args.env_kwargs),
            n_envs=params["n_envs"],
        )
        params.pop("n_envs")
    except KeyError:
        # "n_envs" was not tuned; fall back to a single, unvectorized env
        env = gym.make(args.env)

    # Constructing the network architecture
    # Mapping net_arch to actual network architectures for SB
    net_arch = {
        "small": dict(pi=[64, 64], vf=[64, 64]),
        "med": dict(pi=[256, 256], vf=[256, 256]),
        "large": dict(pi=[400, 400], vf=[400, 400]),
    }[params["net_arch"]]

    # Creating a custom LSTM policy
    class CustomLSTMPolicy(LstmPolicy):
        def __init__(
            self,
            sess,
            ob_space,
            ac_space,
            n_env,
            n_steps,
            n_batch,
            n_lstm=params["n_lstm"],
            reuse=False,
            **_kwargs,
        ):
            super().__init__(
                sess,
                ob_space,
                ac_space,
                n_env,
                n_steps,
                n_batch,
                n_lstm,
                reuse,
                net_arch=[100, "lstm", net_arch],
                layer_norm=True,
                feature_extraction="mlp",
                **_kwargs,
            )

    # Deleting keys that can't be used in SB models
    keys_to_delete = ["batch_size", "n_lstm", "net_arch", "joker"]
    if "lambda" in params:
        keys_to_delete.append("lambda")
        params["lam"] = params["lambda"]
    for key in keys_to_delete:
        params.pop(key, None)
    return params, CustomLSTMPolicy, env
def _setup(self):
    # Game parameters
    self.env = make_vec_env(self.ENV_NAME, n_envs=self.num_envs)
    self.env.play_type = PLAY_TYPE.MACHINE
    self.env.render_mode = 'machine'
    self.env.MAX_TURNS = self.max_turns
    self.env.reset()

    # Report success
    print('Created new environment {0} with GameID: {1}'.format(self.ENV_NAME, self.GAME_ID))
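# Caution, and a hedged alternative: plain attribute assignment as above sets
# the attribute on the VecEnv wrapper itself, not on the wrapped sub-envs. If
# the sub-envs are meant to see these values, the stable-baselines VecEnv
# set_attr API forwards them (attribute names reused from the code above):
#
#   self.env.set_attr('play_type', PLAY_TYPE.MACHINE)
#   self.env.set_attr('render_mode', 'machine')
#   self.env.set_attr('MAX_TURNS', self.max_turns)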
def train():
    make_env_def()
    # multiprocess environment
    env = make_vec_env('AI4U-v0', n_envs=8)
    model = PPO2(CustomPolicy, env, verbose=1, n_steps=32, nminibatches=4,
                 tensorboard_log="./logs/")
    model.learn(total_timesteps=1000000)
    model.save("ppo2_model")
    del model  # remove to demonstrate saving and loading
def rendu(fichier):
    env = make_vec_env('CartPoleSwingUpContinuous-v0', n_envs=1)
    model = PPO2.load(fichier, cloudpickle=False)
    obs = env.reset()
    for _ in range(1000000):
        sleep(0.009)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
def rendu():
    env = make_vec_env('My-CartPole-v0', n_envs=1)
    model = PPO2.load("./weights/ppo2_cartpole250000.zip")
    obs = env.reset()
    while True:
        sleep(0.009)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
def train(args):  # Using Stable Baselines
    """
    Train the algorithm (with a given policy)
    """
    env_info = {
        "args": args,
        "external_func": ext_func_list,
        "params": request_params
    }
    env = make_vec_env(tradingEnv.TradingEnvironment,
                       n_envs=args.num_envs,
                       env_kwargs={"env_info": env_info})
    # env = VecFrameStack(env, n_stack=4)  # Uncomment to enable visualizations!
    print("Vectorized env created")
    print("Creating model")

    # Constants for saving logs and models
    exp_name = args.exp_name
    save_dir = os.path.join(BASE_PATH, 'logs_models', exp_name)

    # Create PPO2 model now
    model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log=save_dir,
                 full_tensorboard_log=False)

    # Train the model and save the results
    print("Training the network")
    try:
        # train for the first args.num_epochs timesteps
        steps_per_batch, num_envs = model.n_steps, env.num_envs
        model.learn(total_timesteps=args.num_epochs,
                    tb_log_name=exp_name,
                    log_interval=10)
        print("First training done")
        model.save(save_path=os.path.join(BASE_PATH, 'logs_models', exp_name,
                                          exp_name + '_finished'))
    except Exception as e:
        print("Exception occurred during training", e)
        model_name = os.path.join(save_dir, "PPO2_error")
        model.save(model_name)
        import traceback
        traceback.print_exc()
        print("model saved")
    return
def train(dest_path="ppo2model", logdir='./logs/', pretrainedmodel=None,
          nsteps=1, total_timesteps=10000, n_envs=1, verbose=1,
          nminibatches=4):
    model = None
    make_env_def()
    env = make_vec_env('AI4U-v0', n_envs=n_envs)
    if pretrainedmodel is not None:
        model = PPO2.load(pretrainedmodel, policy=CustomPolicy,
                          tensorboard_log=logdir, nminibatches=nminibatches)
        model.set_env(env)
    else:
        model = PPO2(CustomPolicy, env, verbose=verbose,
                     nminibatches=nminibatches, n_steps=nsteps,
                     tensorboard_log=logdir)
    model.learn(total_timesteps=total_timesteps, reset_num_timesteps=False,
                tb_log_name=logdir)
    model.save(dest_path)
def run_openai_baselines(
    reward_config_file,
    hysr_one_ball_config_file,
    ppo_config_file,
    log_episodes=False,
    log_tensorboard=False,
    model_file_path=None,
):
    import tensorflow as tf
    from stable_baselines.common import make_vec_env

    env_config = {
        "reward_config_file": reward_config_file,
        "hysr_one_ball_config_file": hysr_one_ball_config_file,
        "log_episodes": log_episodes,
        "log_tensorboard": log_tensorboard,
    }
    env = make_vec_env(HysrOneBallEnv, env_kwargs=env_config)
    ppo_config = OpenAIPPOConfig.from_json(ppo_config_file)
    total_timesteps = ppo_config["num_timesteps"]
    del ppo_config["num_timesteps"]
    save_path = ppo_config["save_path"]
    del ppo_config["save_path"]
    if ppo_config["activation"] == "tf.tanh":
        ppo_config["activation"] = tf.tanh
    alg = "ppo2"
    learn = get_alg_module_openai_baselines(alg).learn
    # seed = 123
    if model_file_path is None:
        print("total timesteps:", total_timesteps)
        model = learn(
            env=env,
            # seed=seed,
            total_timesteps=total_timesteps,
            **ppo_config)
        model.save("ppo2_openai_baselines_hysr_one_ball")
    else:
        ppo_config["load_path"] = model_file_path
        model = learn(
            env=env,
            # seed=seed,
            total_timesteps=0,
            **ppo_config)
    if save_path:
        model.save(save_path)
        print("model saved to", save_path)
    return model, env
def process_environment(self, ind, paddle_length=None, paddle_speed=None,
                        ball_speed=None):
    """
    This method is executed by multiple threads. It measures the quality of a
    neural network by analyzing different parameter values of the Gym
    environment JuPong2D. The results are mean return values, which are saved
    to a CSV file.

    :param ind: Thread index for a scale factor
    :param paddle_length: Factor for the paddle length
    :param paddle_speed: Factor for the paddle speed
    :param ball_speed: Factor for the ball speed
    """
    env = make_vec_env(self.env_name, n_envs=4)
    if paddle_length is not None:
        for gym_env in env.envs:
            gym_env.scale_paddle_height(paddle_length)
        print(f"Paddle Length {paddle_length}")
    elif paddle_speed is not None:
        for gym_env in env.envs:
            gym_env.scale_paddle_vel(paddle_speed)
        print(f"Paddle Speed {paddle_speed}")
    elif ball_speed is not None:
        for gym_env in env.envs:
            gym_env.scale_ball_velocity(ball_speed)
        print(f"Ball Speed {ball_speed}")
    else:
        print("No parameter set.")
        return

    obs = env.reset()
    return_vals = np.array([0.0, 0.0, 0.0, 0.0])
    return_val_arr = []
    done_cnt = 0
    for _ in range(self.play_steps):
        while True:
            action, _states = self.model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            return_vals += rewards
            for i, done in enumerate(dones):
                if done:
                    done_cnt += 1
                    return_val_arr.append(return_vals[i])
                    return_vals[i] = 0.0
                    self.return_arr[ind] = np.mean(return_val_arr)
                    print(self.return_arr)
                    with open(self.save_file, 'w') as my_file:
                        writer = csv.writer(my_file)
                        writer.writerow(self.scale_factor_arr)
                        writer.writerow(self.return_arr)
            if done_cnt >= 4:
                break
def forward_search(trained_timesteps, env_name, n_envs, save_file, seed, pid):
    env = make_vec_env(env_name, n_envs=n_envs)
    model = PPO.load(save_file, env=env, pid=pid)
    model = model.learn(trained_timesteps, tb_log_name="PPO",
                        reset_num_timesteps=True)
    mean, std = test(model, env_name)
    pid = os.getpid()
    model.save(save_file, pid=pid)
    env.close()
    return [model.get_parameters(), pid, mean, std]
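# The test() helper called above is not shown. A plausible implementation,
# assuming it reports the mean and std of episode rewards on a fresh env
# (n_eval_episodes is an illustrative choice, not from the original project):
def test(model, env_name, n_eval_episodes=10):
    from stable_baselines.common.evaluation import evaluate_policy
    eval_env = make_vec_env(env_name, n_envs=1)
    mean, std = evaluate_policy(model, eval_env,
                                n_eval_episodes=n_eval_episodes)
    eval_env.close()
    return mean, std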
def launchAgent(model_name: str):
    """
    :param model_name: The kind of model to run. Must be HER, DDPG, PPO2, or
                       anything else (which falls back to DQN).
                       Currently intended to be set to PPO2.
    :return: The model after 1000 training cycles
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    # use elif so the final else only catches the fallback case; with separate
    # if statements the DQN branch would overwrite the HER and DDPG models
    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    elif model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    elif model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",              # policy
            env=env,                  # environment
            double_q=True,            # Double Q enabled
            prioritized_replay=True,  # replay buffer enabled
            verbose=0                 # log print
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i), env)
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i), env)
            elif model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i), env)

        model.learn(total_timesteps=12500)  # minimum count at which FPS stays above 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        # keep the final model bound so it can be returned after the loop
        if i < 999:
            del model

    return model
def retrain(self, env, timesteps, modelpath, tensorboard_logs_path):
    # Periodically evaluate the agent and save the best model
    eval_callback = EvalCallback2(env,
                                  best_model_save_path='app/model_logs/',
                                  log_path='app/model_logs/',
                                  eval_freq=1000,
                                  deterministic=True,
                                  render=False)
    model = self.load(modelpath, tensorboard_logs_path)
    env_str = self.get_env_str(env)
    model.set_env(make_vec_env(env_str, n_envs=8))
    # Continue training
    model.learn(total_timesteps=timesteps,
                callback=eval_callback,
                reset_num_timesteps=False,
                tb_log_name="TB_" + datetime.now().strftime('%d%m%y-%H%M'))
    model.save(modelpath)
    return model
def test():
    make_env_def()
    # multiprocess environment
    env = make_vec_env('AI4U-v0', n_envs=8)
    model = PPO2.load("ppo2_model_baked", policy=CustomPolicy,
                      tensorboard_log="./logs/")
    model.set_env(env)

    # Enjoy trained agent
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)