def train(env, log_dir):
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    env = VecNormalize(env, training=True, norm_obs=True, norm_reward=True,
                       gamma=0.9997, clip_obs=10., clip_reward=10., epsilon=0.1)
    drive = PPO("MlpPolicy", env,
                ent_coef=0.01, vf_coef=1, batch_size=32,
                learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.1),
                n_steps=1000, n_epochs=20,
                tensorboard_log=log_dir + "/drive_tensorboard_log", verbose=1)
    drive.learn(total_timesteps=total_timesteps, callback=callback)
    # Continue training over several runs, keeping the timestep counter, and
    # save a checkpoint after each run. Note: the env must not be closed
    # between runs; a closed env cannot be stepped again.
    for i in range(total_train_runs):
        drive.learn(total_timesteps=total_timesteps, callback=callback,
                    reset_num_timesteps=False)
        drive.save("conduziadrive")
    env.close()
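# `linear_schedule`, used above and again in the CartPole expert trainer
# further down, is not defined in this file. A minimal sketch following the
# schedule pattern from the Stable-Baselines3 docs (an assumption, not
# necessarily the authors' exact helper):
def linear_schedule(initial_value: float):
    """Return a function mapping remaining progress (1 -> 0) to a value."""
    def func(progress_remaining: float) -> float:
        # progress_remaining goes from 1 (start of training) to 0 (end)
        return progress_remaining * initial_value
    return func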
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]

    gym_config = SimulationParameters(time_step=TIME_STEP)
    robot_class = QuadrupedRobot
    robot_params = MiniCheetahParams(
        on_rack=False,
        enable_self_collision=True,
        motor_control_mode=MotorControlMode.HYBRID_COMPUTED_POS_TROT)
    task = TestTask(train_or_test=TEST_OR_TRAIN)
    env = LocomotionGymEnv(gym_config, robot_class, robot_params, task)

    policy_save_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data/policies')
    os.makedirs(policy_save_dir, exist_ok=True)
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    if TEST_OR_TRAIN == "train":
        model = PPO('MlpPolicy', env, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        model = PPO.load(POLICY_SAVE_PATH)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
def main():
    tensorboard_log = "./log"
    env = Pinokio3()
    # PPO requires a vectorized environment; a single env passed to the
    # constructor is wrapped in a DummyVecEnv automatically.
    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]),
                         tensorboard_log=tensorboard_log)
    else:
        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=net_arch)
        # Note: policy_kwargs must be passed to the constructor, otherwise
        # the custom activation/architecture above is silently ignored.
        model = PPO(MlpPolicy, DummyVecEnv([lambda: env]), verbose=1,
                    policy_kwargs=policy_kwargs,
                    tensorboard_log=tensorboard_log)

    # https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html
    checkpoint_callback = CheckpointCallback(save_freq=10000,
                                             save_path='./checkpoints/',
                                             name_prefix='pinokio3')
    while True:
        model.learn(total_timesteps=15000000, callback=checkpoint_callback,
                    tb_log_name=tb_log_name)
        model.save(save_file)
        print("saved")
        obs = env.reset()
        for i in range(20):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            print("action {} -> reward {}".format(
                env.decode_action(action), reward))
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
def train_pa_ppo(path='pa_ppo'):
    """
    Runs training in an environment whose opponent plays its hand
    with probability 1/3.

    Args:
        path    path for the trained model file
    Returns:
        None
    """
    print(f'train ppo with jurina_player path={path}')

    # Build the rock-paper-scissors environment
    env = RockPaperScissorsEnv(JurinaPlayer())
    env = Monitor(env, LOGDIR, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Initialize the PPO model
    model = PPO('MlpPolicy', env, verbose=1)

    # Run training (store the start time, then report the difference)
    start = time.time()
    model.learn(total_timesteps=1000000)
    print(f'elapsed time: {time.time() - start}sec')

    # Save the trained model
    model.save(path)

    # Close the environment
    env.close()
def main():
    tensorboard_log = "./log"
    env = Pinokio5()
    # PPO requires a vectorized environment; a single env passed to the
    # constructor is wrapped in a DummyVecEnv automatically.
    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]),
                         tensorboard_log=tensorboard_log)
    else:
        model = PPO(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log)

    try:
        while True:
            model.learn(total_timesteps=8000000, tb_log_name=tb_log_name)
            model.save(save_file)
            obs = env.reset()
            for i in range(100):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                env.render()
                if done:
                    print("resetting because " + str(done))
                    env.reset()
    except KeyboardInterrupt:
        print("Saving before exiting...")
        model.save(save_file)
        print("k bye")
def train():
    omniverse_kit = OmniKitHelper(CUSTOM_CONFIG)
    # Disable all anti-aliasing in the renderer because we want to train on
    # the raw camera image.
    omniverse_kit.set_setting("/rtx/post/aa/op", 0)

    env = TestEnv(omniverse_kit, max_resets=10, updates_per_step=3)
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             save_path="./params/",
                                             name_prefix="rl_model")
    net_arch = [512, 256, dict(pi=[128, 64, 32], vf=[128, 64, 32])]
    policy_kwargs = {
        "net_arch": net_arch,
        "features_extractor_class": CustomCNN,
        "activation_fn": torch.nn.ReLU,
    }
    model = PPO("CnnPolicy", env, verbose=1, tensorboard_log="tensorboard",
                policy_kwargs=policy_kwargs, device="cuda")
    # model = PPO.load("checkpoint_25k.zip", env)
    model.learn(
        total_timesteps=25000,
        callback=checkpoint_callback,
        eval_env=env,
        eval_freq=1000,
        eval_log_path="./eval_log/",
        reset_num_timesteps=False,
    )
    model.save("checkpoint_25k")
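# `CustomCNN`, referenced above (and again in the Agent class further down),
# is not defined in this file. A minimal sketch following the custom
# feature-extractor pattern from the Stable-Baselines3 docs; the layer sizes
# are an assumption, not the authors' actual network. The `cnn`/`linear`
# attribute split matches how save_new_model() below loads pretrained weights.
import torch
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space, features_dim: int = 256):
        super().__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]  # channels-first obs
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Flatten(),
        )
        # Infer the flattened size with one dummy forward pass
        with torch.no_grad():
            sample = torch.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.cnn(sample).shape[1]
        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim),
                                    nn.ReLU())

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        return self.linear(self.cnn(observations))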
def trained_agent(episodes=256, continuous=True, load=None, save_name="test",
                  ent_coef=0.00001, total_timesteps=25000, learning_rate=lr()):
    env = gym.make("bilboquet-v0", continuous=continuous, amplitude=10)
    env.reset((300, 300))

    if load is None:
        model = PPO('MlpPolicy', env, verbose=1, ent_coef=ent_coef,
                    learning_rate=learning_rate,
                    tensorboard_log="./ppo_bilboquet_tensorboard/")
        model.learn(total_timesteps=total_timesteps, tb_log_name=save_name)
        model.save(save_name + '.zip')
        print('DONE')
        obs = env.reset()
    else:
        model = PPO.load(load)
        obs = env.reset()

    for i in range(episodes):
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        # print(reward)
        env.render()
        if done:
            obs = env.reset()
def save_new_model(name, env, num_envs, model_dir, batch_size=None,
                   n_steps=None, n_epochs=None, clip_range=None, gamma=None,
                   gae_lambda=None, vf_coef=None, ent_coef=None,
                   learning_rate=None, image_based=False, image_pretrain=None,
                   verbose=0, w=.1):
    # Fill in any hyperparameter that was not specified by the caller
    if not batch_size:
        batch_size = choose_hyperp("batch_size", 10, w=w)
    if not n_steps:
        n_steps = max(batch_size, choose_hyperp("n_steps", 10, w=w)) // num_envs
    if not n_epochs:
        n_epochs = choose_hyperp("n_epochs", 2, w=w)
    if not clip_range:
        clip_range = choose_hyperp("clip_range", 1, w=w)
    if not gamma:
        gamma = choose_hyperp("gamma", 2, w=w)
    if not gae_lambda:
        gae_lambda = choose_hyperp("gae_lambda", 1, w=w)
    if not vf_coef:
        vf_coef = choose_hyperp("vf_coef", 0, w=w)
    if not ent_coef:
        ent_coef = choose_hyperp("ent_coef", 0, w=w)
    if not learning_rate:
        learning_rate = choose_hyperp("learning_rate", 5, w=w)

    feature_extractor = "MlpPolicy"
    if image_based:
        feature_extractor = "CnnPolicy"

    model = PPO(feature_extractor, env, batch_size=batch_size, n_steps=n_steps,
                n_epochs=n_epochs, clip_range=clip_range, gamma=gamma,
                gae_lambda=gae_lambda, vf_coef=vf_coef, ent_coef=ent_coef,
                learning_rate=learning_rate, verbose=verbose)

    if image_based and image_pretrain:
        model.policy.features_extractor.cnn.load_state_dict(
            T.load(image_pretrain + "_cnn.pth"))
        model.policy.features_extractor.linear.load_state_dict(
            T.load(image_pretrain + "_linear.pth"))

    model.save(model_dir + name + '/' + name + "_0")
    return model
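# `choose_hyperp` is not defined in this file and its semantics are not
# documented. From the call sites it takes a hyperparameter name, what looks
# like a default index, and a weight `w`. A purely hypothetical sketch that
# is consistent with those call sites: pick from a named grid, usually the
# default entry, with `w` controlling how often a random value is tried.
import random

HYPERP_GRIDS = {  # hypothetical grids; only the shapes matter here
    "batch_size": [2 ** i for i in range(2, 13)],      # 4 .. 4096
    "n_steps": [2 ** i for i in range(4, 15)],
    "n_epochs": [1, 3, 5, 10, 20],
    "clip_range": [0.1, 0.2, 0.3],
    "gamma": [0.9, 0.95, 0.99, 0.999],
    "gae_lambda": [0.9, 0.95, 0.98],
    "vf_coef": [0.5, 1.0],
    "ent_coef": [0.0, 0.001, 0.01],
    "learning_rate": [10 ** -i for i in range(1, 7)],  # 0.1 .. 1e-6
}

def choose_hyperp(name, default_idx, w=0.1):
    grid = HYPERP_GRIDS[name]
    default_idx = min(default_idx, len(grid) - 1)
    # With probability w explore a random grid value; otherwise keep default
    if random.random() < w:
        return random.choice(grid)
    return grid[default_idx]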
def ppo_stable_baselines_training():
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    envs = make_vec_env(config.env_name, n_envs=config.num_processes)
    model = PPO("CnnPolicy", envs, verbose=1, tensorboard_log="./runs/",
                clip_range=config.clip_param, n_steps=50,
                learning_rate=config.lr, gamma=config.gamma,
                gae_lambda=config.gae_lambda, ent_coef=config.entropy_coef,
                max_grad_norm=config.max_grad_norm,
                vf_coef=config.value_loss_coef,
                batch_size=config.num_mini_batch)
    model.learn(total_timesteps=config.num_steps, log_interval=1,
                callback=WandbStableBaselines3Callback())
    model.save(f"{config.env_name}_stable_baselines_ppo")
def train_rl(self, models_to_train=40, episodes_per_model=100, path='./logs/'):
    # Callback for saving the best agent during training
    eval_callback = EvalCallback(self.env, best_model_save_path=path,
                                 log_path=path, eval_freq=500,
                                 deterministic=True, render=False)
    model = PPO(MlpPolicy, self.env, verbose=1, learning_rate=0.0003,
                tensorboard_log=path)
    start = time.time()
    for i in range(models_to_train):
        steps_per_model = episodes_per_model * self.param.steps_per_episode
        model.learn(total_timesteps=steps_per_model, callback=eval_callback)
        model.save("MODEL_" + str(i))
    end = time.time()
    print("time (min): ", (end - start) / 60)
def run(config: Dict[str, Any], logdir: pathlib.PosixPath):
    env = make_env(config)

    if config["mode"] == "evaluate":
        print("Start evaluation.")
        model = PPO.load(logdir / "model.zip")
    elif config["mode"] == "train" and args.logdir:
        print("Start training from existing model.")
        model = PPO.load(logdir / "model.zip")
        model.set_env(env)
        model.learn(total_timesteps=config["train_steps"])
    else:
        print("Start training.")
        model = PPO(
            "CnnPolicy",
            env,
            verbose=1,
            tensorboard_log=str(logdir / "tensorboard"),
            use_sde=True,
        )
        model.learn(total_timesteps=config["train_steps"])

    mean_reward, std_reward = evaluate_policy(
        model, env, n_eval_episodes=config["eval_eps"], deterministic=True)
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

    if config["mode"] == "train":
        model.save(logdir / "model")
    env.close()
def pybullet_example():
    # PyBullet: Normalizing input features
    import pybullet_envs

    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    # Automatically normalize the input features and reward.
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    model = PPO("MlpPolicy", env)
    model.learn(total_timesteps=2000)

    # Don't forget to save the VecNormalize statistics when saving the agent.
    log_dir = "/tmp/"
    model.save(log_dir + "ppo_halfcheetah")
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env.save(stats_path)

    # To demonstrate loading.
    del model, env

    # Load the saved statistics.
    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    env = VecNormalize.load(stats_path, env)
    # Do not update the statistics at test time.
    env.training = False
    # Reward normalization is not needed at test time.
    env.norm_reward = False

    # Load the agent.
    model = PPO.load(log_dir + "ppo_halfcheetah", env=env)
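# A short follow-up sketch (not in the original): once the agent and the
# VecNormalize statistics are restored, the model can be evaluated on the
# frozen-statistics env using Stable-Baselines3's evaluate_policy helper.
from stable_baselines3.common.evaluation import evaluate_policy

def evaluate_loaded(model, env, n_eval_episodes=10):
    mean_reward, std_reward = evaluate_policy(
        model, env, n_eval_episodes=n_eval_episodes, deterministic=True)
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")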
def main():
    num_cpu = 1
    load_version = ''
    save_version = '1b_v0'
    load_dir = '../models'
    save_dir = '../models'
    timesteps_per_checkpoint = int(1e6)
    num_checkpoints = int(1e1)  # controls performance level of agent

    os.makedirs(save_dir, exist_ok=True)

    alg_env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    print('created alg env')

    train_policy = 'MlpPolicy'
    load_path = '{}/alg_v{}.zip'.format(load_dir, load_version)
    if os.path.exists(load_path):
        alg = PPO(train_policy, alg_env, verbose=0)
        alg.set_parameters(load_path, exact_match=True)
        # alg = PPO.load(load_path, env=alg_env)
        print('loaded alg checkpoint ' + load_path)
    else:
        alg = PPO(train_policy, alg_env, verbose=0)
        print('created alg model')

    save_path = '{}/alg_v{}.zip'.format(save_dir, save_version)
    for _ in range(num_checkpoints):
        alg.learn(total_timesteps=timesteps_per_checkpoint)
        alg.save(save_path)
        print('saved alg checkpoint ' + save_path)
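# `make_env(i)` above is not defined in this file. A minimal sketch of the
# usual Stable-Baselines3 factory pattern for SubprocVecEnv; the env id
# `ENV_ID` is an assumption, since the original id is not shown:
import gym

ENV_ID = 'CartPole-v1'  # placeholder env id

def make_env(rank, seed=0):
    def _init():
        env = gym.make(ENV_ID)
        env.seed(seed + rank)  # distinct seed per worker process
        return env
    return _init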
def main():
    # Instantiate the env
    env = Gaze(fitts_W=fitts_W, fitts_D=fitts_D, ocular_std=ocular_std,
               swapping_std=swapping_std)
    env = Monitor(env, log_dir)

    # Instantiate the agent
    model = PPO('MlpPolicy', env, verbose=0, clip_range=0.15)

    # Save a checkpoint periodically. Note: this callback must be defined
    # (it was commented out before), since it is passed to learn() below.
    save_freq_n = int(timesteps // 10)
    checkpoint_callback = CheckpointCallback(save_freq=save_freq_n,
                                             save_path=f'{log_dir}savedmodel/',
                                             name_prefix='model_ppo')

    # Train the agent
    model.learn(total_timesteps=int(timesteps), callback=checkpoint_callback)

    # Save the model
    model.save(f'{log_dir}savedmodel/model_ppo')

    # Plot the learning curve
    plot_results2(log_dir)
    save_learned_behaviour()
def main():
    env = Pinokio2()
    # PPO requires a vectorized environment; a single env passed to the
    # constructor is wrapped in a DummyVecEnv automatically.
    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]))
    else:
        model = PPO(MlpPolicy, env, verbose=1)

    while True:
        model.learn(total_timesteps=100000)
        model.save(save_file)
        obs = env.reset()
        for i in range(10):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
def learn(env_name, save_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=total_timesteps)
    model.save(save_file)
    del model
    env.close()
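# Assumed example usage of learn() above (not in the original): CnnPolicy
# needs image observations, so the env id must be one that returns frames.
# Atari ids are a common choice; frame wrappers may be needed in practice.
if __name__ == "__main__":
    learn("BreakoutNoFrameskip-v4", "ppo_breakout", total_timesteps=100_000)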
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False,
                       training=True)

    # Note: `wandb_use` is not a standard stable-baselines3 PPO argument;
    # it presumably comes from a customized fork.
    model = PPO('MlpPolicy', env, verbose=1, n_steps=int(4096 / n_cpu),
                wandb_use=False)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    # Export the policy weights and normalization statistics as text files
    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)
    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False
    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy the trained agent
    obs = np.copy(env.reset())
    epi_reward = 0
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards
        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
def train_lunarlander_expert():
    env = make_vec_env('LunarLander-v2', n_envs=16)
    # Default hyperparameters are used, as the tuned ones did not work well.
    model = PPO('MlpPolicy', env, verbose=1,
                policy_kwargs=dict(net_arch=[64, 64]))
    model.learn(total_timesteps=int(2e6))
    model.save("experts/LunarLander-v2/lunarlander_expert")
    gen_expert_demos('LunarLander-v2', gym.make('LunarLander-v2'), model, 25)
def __call__(self):
    policy_kwargs = dict(activation_fn=th.nn.ReLU)
    model = PPO('CnnPolicy', self.env, learning_rate=1e-3,
                policy_kwargs=policy_kwargs).learn(self.total_time_steps)
    model.save('PPO_' + self.game_name)
    del model  # the model has been trained and saved, so it's no longer needed
def train_ppo(config):
    print("RUNNING PPO")
    env = get_env()
    callback = ReportCallback()
    # model = PPO('MlpPolicy', env, verbose=1)
    model = PPO('CnnPolicy', env, verbose=1, **config)
    model.learn(total_timesteps=1000000, callback=callback)
    model.save("ppo_pong")
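# `ReportCallback` above is not defined in this file. A minimal sketch using
# the standard BaseCallback hook from the Stable-Baselines3 callback guide;
# what it actually reports is an assumption:
from stable_baselines3.common.callbacks import BaseCallback

class ReportCallback(BaseCallback):
    def __init__(self, report_freq: int = 10_000, verbose: int = 0):
        super().__init__(verbose)
        self.report_freq = report_freq

    def _on_step(self) -> bool:
        # Periodically report training progress; returning False here
        # would abort training early.
        if self.n_calls % self.report_freq == 0:
            print(f"steps so far: {self.num_timesteps}")
        return True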
def train(): """Trains a PPO2 policy.""" env_args = env_parser.parse_known_args()[0] policy_args = policy_parser.parse_known_args()[0] opt_args = opt_parser.parse_known_args()[0] os.makedirs(opt_args.save_path, exist_ok=True) # create environment # train_env = GFootballEnv(env_args) # for evaluation train_env = DummyVecEnv([ make_env(env_args, opt_args.save_path, rank=i) for i in range(opt_args.num_envs) ]) eval_env = GFootballEnv(env_args) # for evaluation check_env(env=eval_env, warn=True) # define rl policy/value network policy = getattr(sys.modules[__name__], policy_args.policy) # initialize ppo tb_dir = os.path.join(opt_args.save_path, "tensorboard") os.makedirs(tb_dir, exist_ok=True) verbose = 1 ppo = PPO(policy, train_env, learning_rate=opt_args.lr, n_steps=opt_args.n_steps, n_epochs=opt_args.n_epochs, gamma=opt_args.gamma, gae_lambda=0.95, clip_range=opt_args.clip_range, clip_range_vf=None, ent_coef=opt_args.ent_coef, vf_coef=opt_args.vf_coef, max_grad_norm=opt_args.max_grad_norm, tensorboard_log=tb_dir, verbose=verbose, seed=opt_args.seed) # load initial checkpoint if opt_args.load_path: ppo.load(os.path.join(opt_args.load_path, "ppo_gfootball.pt")) # start training ppo eval_dir = os.path.join(opt_args.save_path, "eval") os.makedirs(eval_dir, exist_ok=True) ppo.learn(opt_args.num_timesteps, log_interval=1, eval_env=eval_env, eval_freq=opt_args.save_interval, n_eval_episodes=10, eval_log_path=eval_dir) # save final checkpoint ppo.save(os.path.join(opt_args.save_path, "ppo_gfootball"))
def train(time_steps, save=False, **params):
    env = PPOAgent.create_env(1)
    model = PPO('CnnPolicy', env, verbose=params.get('verbose', 1),
                tensorboard_log=TB_LOGS)
    model.learn(total_timesteps=time_steps)
    if save:
        model.save(MODEL_PATH)
def train_cartpole_expert():
    env = make_vec_env('CartPole-v1', n_envs=8)
    model = PPO('MlpPolicy', env, verbose=1, n_steps=32, batch_size=256,
                gae_lambda=0.8, gamma=0.98, n_epochs=20, ent_coef=0.0,
                learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.2),
                policy_kwargs=dict(net_arch=[64, 64]))
    model.learn(total_timesteps=int(1e5))
    model.save("experts/CartPole-v1/cartpole_expert")
    gen_expert_demos('CartPole-v1', gym.make('CartPole-v1'), model, 25)
class Agent(object):
    def __init__(self, env, model=None):
        if model:
            self.model = model
        else:
            self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(
                ":", "-")
            os.makedirs(self.log_dir, exist_ok=True)
            monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
            vec_env = DummyVecEnv([lambda: monitor_env])
            policy_kwargs = dict(
                features_extractor_class=CustomCNN,
                features_extractor_kwargs=dict(features_dim=256),
                net_arch=[dict(pi=[64, 64], vf=[64, 64])])
            self.model = PPO(CustomCnnPolicy,
                             vec_env,
                             policy_kwargs=policy_kwargs,
                             verbose=1,
                             learning_rate=0.001)

    def function(self, obs, conf):
        import random
        col, _ = self.model.predict(np.array(obs['board']).reshape(
            6, 7, 1))  # TODO: Connect-4 specific so far
        is_valid = (obs['board'][int(col)] == 0)
        if is_valid:
            return int(col)
        # Fall back to a random valid column. Note: use the `conf` argument
        # (not an undefined global `config`) and index obs consistently.
        return random.choice([
            c for c in range(conf.columns) if obs['board'][int(c)] == 0
        ])

    def train(self, timesteps):
        self.model.learn(total_timesteps=timesteps)

    def save(self, name: str):
        self.model.save(name)

    def load(self, name: str, env, replace_parameters=None):
        self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(
            ":", "-")
        os.makedirs(self.log_dir, exist_ok=True)
        monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
        vec_env = DummyVecEnv([lambda: monitor_env])
        self.model = PPO.load(name, env=vec_env,
                              custom_objects=replace_parameters)

    def plot(self):
        # Plot cumulative reward
        with open(os.path.join(self.log_dir, "monitor.csv"), 'rt') as fh:
            firstline = fh.readline()
            assert firstline[0] == '#'
            df = pd.read_csv(fh, index_col=None)['r']
        df.rolling(window=1000).mean().plot()
        plt.show()
def train_pendulum_expert():
    env = make_vec_env('Pendulum-v0', n_envs=8)
    model = PPO('MlpPolicy', env, verbose=1, n_steps=2048, batch_size=64,
                gae_lambda=0.95, gamma=0.99, n_epochs=10, ent_coef=0.0,
                learning_rate=3e-4, clip_range=0.2,
                policy_kwargs=dict(net_arch=[256, 256]))
    model.learn(total_timesteps=int(2e6))
    model.save("experts/Pendulum-v0/pendulum_expert")
    gen_expert_demos('Pendulum-v0', gym.make('Pendulum-v0'), model, 25)
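# `gen_expert_demos`, used by all three expert trainers above, is not
# defined in this file. A hypothetical sketch matching its call signature
# (env name, raw env, trained model, number of episodes): roll out the
# deterministic policy and save the transitions. The storage path and
# format are assumptions.
import numpy as np

def gen_expert_demos(env_name, env, model, n_episodes):
    observations, actions = [], []
    for _ in range(n_episodes):
        obs, done = env.reset(), False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            observations.append(obs)
            actions.append(action)
            obs, reward, done, info = env.step(action)
    np.savez(f"experts/{env_name}/demos.npz",
             observations=np.array(observations),
             actions=np.array(actions))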
def main():
    # Create the callback: check every 1000 steps
    log_dir = 'log'
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir)
                             for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=0.001,
                    n_steps=500,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir)
                             for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()
        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True,
                                clip_obs=10.)
        eval_env.reset()
        model.learn(total_timesteps=25000000,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False)
        # Don't forget to save the VecNormalize statistics with the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000,
                     prefix='ppo_' + env_name + videoName)
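# `record_video` above is not defined in this file. A minimal sketch
# following the VecVideoRecorder example from the Stable-Baselines3 docs;
# the video folder is an assumption:
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder

def record_video(env_id, model, video_length=500, prefix='',
                 video_folder='videos/'):
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Record a video starting at the first step
    eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)
    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)
    eval_env.close()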
def train_PPO(env_train, model_name, timesteps=50000):
    """PPO model"""
    start = time.time()
    model = PPO('MlpPolicy', env_train)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (PPO): ', (end - start) / 60, ' minutes')
    return model
def train_ppo(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_ppo_{itr}")
    obs = env.reset()
    model = PPO(
        "CnnPolicy",
        env,
        verbose=1,
        learning_rate=1e-5,
        tensorboard_log=f"./ppo_flappy_tensorboard_{itr}/")
    model.learn(total_timesteps=timesteps)
    model.save(f"ppo_flappy_{itr}")
def train_ppo():
    log_dir = "model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    model = PPO("CnnPolicy", env, policy_kwargs=policy_kwargs, verbose=1,
                batch_size=2048, seed=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480,
                                                log_dir=log_dir)
    model.learn(total_timesteps=int(4800), callback=callback,
                log_interval=480)
    model.save('model_save/PPO')
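# `SaveOnBestTrainingRewardCallback`, used here and in the first snippet of
# this file, is not defined here. A sketch based on the callback example in
# the Stable-Baselines3 docs: it reads the Monitor logs every `check_freq`
# steps and saves the model whenever the recent mean reward improves.
import os
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # Mean reward over the last 100 episodes from the Monitor logs
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    if self.verbose > 0:
                        print(f"Saving new best model to {self.save_path}")
                    self.model.save(self.save_path)
        return True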
def train():
    env = gym.make(TRAIN_ENV)
    env = PovOnlyObservation(env)
    env = ActionShaping(env, always_attack=True)
    # For all the PPO hyperparameters you could tune, see:
    # https://github.com/DLR-RM/stable-baselines3/blob/6f822b9ed7d6e8f57e5a58059923a5b24e8db283/stable_baselines3/ppo/ppo.py#L16
    model = PPO("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=TRAIN_TIMESTEPS)  # 2M steps is about 8h at 70 FPS
    model.save(TRAIN_MODEL_NAME)
    env.close()
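# `PovOnlyObservation` above is a MineRL-style wrapper that is not defined
# in this file. A minimal sketch of the usual pattern (an assumption about
# the wrapper's behavior): keep only the 'pov' image from the dict
# observation so that CnnPolicy can consume it. `ActionShaping` is
# environment-specific and not sketched here.
import gym

class PovOnlyObservation(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = self.env.observation_space['pov']

    def observation(self, observation):
        return observation['pov']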