    'return_info': True,
    'train': False
}
eval_env = make_vec_env("L5-CLE-v0", env_kwargs=eval_env_kwargs, n_envs=args.n_eval_envs,
                        vec_env_cls=SubprocVecEnv, vec_env_kwargs={"start_method": "fork"})

# Callbacks
# Note: when using multiple environments, each call to ``env.step()``
# effectively corresponds to ``n_envs`` steps.
# To account for that, use ``save_freq = max(save_freq // n_envs, 1)``.

# Save the model periodically
checkpoint_callback = CheckpointCallback(save_freq=(args.save_freq // args.n_envs),
                                         save_path=args.save_path, name_prefix=args.output)

# Evaluate the model periodically
eval_callback = L5KitEvalCallback(eval_env, eval_freq=(args.eval_freq // args.n_envs),
                                  n_eval_episodes=args.n_eval_episodes, n_eval_envs=args.n_eval_envs,
                                  enable_scene_type_aggregation=args.enable_scene_type_aggregation,
                                  scene_id_to_type_path=args.scene_id_to_type_path)

# Train
model.learn(args.n_steps, callback=[checkpoint_callback, eval_callback])
# Here we could also do multi-worker training (e.g. n_envs=4 => 4 environments);
# the model must support multiprocessing.
env = make_atari_env(atari_env_name, n_envs=1, seed=0)
# Frame-stacking with 4 frames. With 1 frame the algorithm knows the position of things,
# with 2 frames the velocity, with 3 the acceleration and with 4 the jerk.
env = VecFrameStack(env, n_stack=4)

# The test environment must be a separate instance
test_env = make_atari_env(atari_env_name, n_envs=1, seed=0)
# Frame-stacking with 4 frames
test_env = VecFrameStack(test_env, n_stack=4)

model_name = 'ppo-MlpPolicy'
time_stamp = datetime.datetime.now().strftime("-%Y%m%d-%H%M%S")
model_log = LOG_DIR + model_name + time_stamp

ppo_model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=model_log)

max_steps = 10000
ppo_model.learn(total_timesteps=max_steps)
# %tensorboard --logdir {LOG_DIR}

from stable_baselines3.common.callbacks import StopTrainingOnMaxEpisodes
# Note: StopTrainingOnMaxEpisodes is a training callback for ``learn()`` (it must be
# instantiated with ``max_episodes``); it cannot be passed to ``evaluate_policy``.
# callback = StopTrainingOnMaxEpisodes(max_episodes=...)

mean_reward, std_reward = evaluate_policy(ppo_model, test_env, n_eval_episodes=10)
print(f"Eval reward: {mean_reward} (+/-{std_reward})")

record_video(test_env, ppo_model, video_length=5000, prefix='ppo_BerzerkDeterministic-v4')
show_videos(video_path=video_folder, prefix='ppo')

ppo_model.save("ppo_BerzerkDeterministic-v4")

obs = env.reset()
run_id = str(uuid.uuid4())  # ALL running environments must share this
print(f"RUN ID: {run_id}")

# to pass launch args, add to env_kwargs: 'launch_args': ['render:=false', 'plot_log:=true']
env = make_vec_env(RocketLeagueInterface, env_kwargs={'run_id': run_id},
                   n_envs=24, vec_env_cls=SubprocVecEnv)

model = PPO("MlpPolicy", env)

# log training progress as CSV
log_dir = expanduser(f'~/catkin_ws/data/rocket_league/{run_id}')
logger = configure(log_dir, ["stdout", "csv", "log"])
model.set_logger(logger)

# log model weights
freq = 20833  # save 20 times
# freq = steps / (n_saves * n_envs)
callback = CheckpointCallback(save_freq=freq, save_path=log_dir)

# run training
steps = 240000000  # 240M (10M sequential)
print(f"training on {steps} steps")
model.learn(total_timesteps=steps, callback=callback)

# save final weights
print("done training")
model.save(log_dir + "/final_weights")

env.close()  # this must be done to clean up other processes
from callback import SaveOnBestTrainingRewardCallback
from env.env import CitadelsEnv
import os

log_dir = "/Users/daniel/repos/CitadelsAI/logs"
os.makedirs(log_dir, exist_ok=True)

env = CitadelsEnv()
env = Monitor(env, log_dir)
callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)

# Learn
# model = A2C('MlpPolicy', env, verbose=1)
model = PPO('MlpPolicy', env, verbose=1)
model = PPO.load("/Users/daniel/repos/CitadelsAI/logs/best_model.zip", env=env)
model.learn(total_timesteps=100000, callback=callback)

# mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
# print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

# Play
# print('-Play-')
# obs = env.reset()
# for i in range(100):
#     action, _state = model.predict(obs, deterministic=True)
#     obs, reward, done, info = env.step(action)
#     # env.render()
#     if done:
#         obs = env.reset()
        continue_training_model_folder,
        'vec_normalize_' + continue_training_model_filename + '.pkl')

    print(f"Continual training on model located at {continue_training_model_path}")

    # Load normalized env
    env = VecNormalize.load(continue_training_vecnormalize_path, env)

    # Load model
    model = PPO.load(continue_training_model_path, env=env)

    # Training
    model.learn(total_timesteps=training_timesteps, tb_log_name=tb_log_name,
                callback=checkpoint_callback, reset_num_timesteps=True)

    # Save trained model
    model.save(save_model_path)
    env.save(save_vecnormalize_path)

else:
    # Create evaluation environment
    env_options['has_renderer'] = True
    register_gripper(UltrasoundProbeGripper)
    env_gym = GymWrapper(suite.make(env_id, **env_options))
    env = DummyVecEnv([lambda: env_gym])

    # Load normalized env
    env = VecNormalize.load(load_vecnormalize_path, env)
from gym.wrappers import FrameStack, FlattenObservation
from stable_baselines3 import PPO

from top_view_rl_car.sensor_environment import SensorEnvironment
from top_view_rl_car.sensor_environment import config

# Create environment
env = SensorEnvironment(config)
env = FlattenObservation(env)
env = FrameStack(env, 4)
env = FlattenObservation(env)

# Instantiate the agent
model = PPO('MlpPolicy', env, verbose=1, device="cuda")

# Train the agent
model.learn(total_timesteps=int(1e5))
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

programing_type = int(sys.argv[1])
environment_name = 'ChromeCrossyRoad-v0'

# Start to train the agent
if programing_type == 0:
    env = gym.make(environment_name)
    model = PPO("MlpPolicy", env, learning_rate=0.0001, gamma=0.7, batch_size=1024,
                verbose=1, tensorboard_log="./log/ppo_crossy_road_tensorboard/")
    model.learn(total_timesteps=30000)
    model.save("../model/ppo")
    env.close()
# Continue to train
elif programing_type == 1:
    myenv = gym.make(environment_name)
    env = DummyVecEnv([lambda: myenv])
    model = PPO.load('../model/ppo', env=env)
    model.set_env(env)
    model.learn(total_timesteps=20000, callback=None, reset_num_timesteps=False)
    model.save("../model/ppo")
    env.close()
else: model = PPO("MlpPolicy", env_name, learning_rate=1e-3, policy_kwargs=policy_kwargs, tensorboard_log="{}/tensorboard".format(results_root), verbose=1) # Train the agent # Evaluate the model every 1000 steps on 5 test episodes # and save the evaluation to the "logs/" folder # total_timesteps:Number of interactions between agent and environment(one step==one transition); # Each n_steps(2048) contains many episodes; # Then n_steps transitions used to training.(1 epoch == n_steps transitions) model.learn(total_timesteps=100000, eval_freq=1000, n_eval_episodes=5, eval_log_path="./logs/") # save the model model.save("{}/model".format(results_root)) # et policy policy = model.policy # Retrieve the environment env = model.get_env() # Evaluate the policy mean_reward, std_reward = evaluate_policy(policy, env, n_eval_episodes=10, deterministic=True) print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
reward_lb = -0.012 * 4
reward_ub = 0.012 * 2
indicators = [MidPriceDeltaSign(3), Imbalance(), NormalizedPosition(position_limit)]

env = AbsoluteExchange(files=['AAPL_20170201'],
                       indicators=indicators,
                       reward_lb=reward_lb,
                       reward_ub=reward_ub,
                       start_time=34230000000000,
                       end_time=57540000000000,
                       order_size=order_size,
                       position_limit=position_limit,
                       liquidation_ratio=liquidation_ratio)

print('Checking environment')
check_env(env)
print('Done checking environment')

# env = make_vec_env(lambda: env, n_envs=1)
model = PPO('MlpPolicy', env, verbose=False)

print('\nBegin training')
for iteration in range(100):
    print(f'Iteration: {iteration}')
    model.learn(3000)
    evaluate(env, model)
def test_rl():
    import gym
    import datetime as dt
    import matplotlib.pyplot as plt

    # from stable_baselines.common.policies import MlpPolicy, CnnPolicy, MlpLstmPolicy, ActorCriticPolicy, LstmPolicy
    # from stable_baselines.common.vec_env import DummyVecEnv
    # from stable_baselines import PPO2, PPO1, A2C, DQN, TD3, SAC
    # from stable_baselines3.common.policies import MlpPolicy
    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import DummyVecEnv
    from stable_baselines3.common.evaluation import evaluate_policy

    from sklearn import preprocessing
    import pandas as pd
    from lutils.stock import LTdxHq

    ltdxhq = LTdxHq()
    code = '600519'  # 000032 300142 603636 600519
    df = ltdxhq.get_k_data_1min(code, end='2021-09-02')  # 000032 300142 603636 600519
    # df = ltdxhq.get_k_data_daily('603636', end='2019-01-01')  # 000032 300142 603636 600519
    df = StockDataFrame(df.rename(columns={'vol': 'volume'}))

    # min_max_scaler = preprocessing.MinMaxScaler()
    # df = pd.DataFrame(min_max_scaler.fit_transform(df.drop(columns=['date', 'code'])))
    # df.columns = ['open', 'close', 'high', 'low', 'volume', 'amount']

    df_eval = ltdxhq.get_k_data_1min(code, start='2021-09-01')
    df_eval = StockDataFrame(df_eval.rename(columns={'vol': 'volume'}))
    ltdxhq.close()

    # df = ltdxhq.get_k_data_5min('603636')
    # df = ltdxhq.get_k_data_daily('603636')
    # df1 = df[:-240]
    # df2 = df[-240:]

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: LStockDailyEnv(df)])

    # model = PPO2(MlpPolicy, env, verbose=1)  # , tensorboard_log='log')
    model = PPO('MlpPolicy', env, verbose=1)  # , tensorboard_log='log')
    model.learn(100000)
    # model = PPO1(LstmPolicy, env, verbose=1)
    # model.learn(total_timesteps=1000)

    # env.set_attr('df', df2)
    # obs = env.reset()
    # rewards = []
    # actions = []
    # net_worths = []
    # # for i in range(220):
    # for i in range(NEXT_OBSERVATION_SIZE, df2.shape[0]):
    #     # actual_obs = observation(df2, i)
    #     # action, _states = model.predict(actual_obs)
    #     # action = [action]
    #     action, _states = model.predict(obs)
    #     obs, reward, done, info = env.step(action)
    #     rewards.append(reward)
    #     actions.append(action[0][0])
    #     net_worths.append(info[0]['net_worth'])
    #     # print(info[0]['current_step'])
    #     env.render()

    # mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=1, render=True)  # EVAL_EPS
    # print(mean_reward)

    model.save('ppo_stock')
    # model = PPO.load('ppo_stock')

    eval_env = DummyVecEnv([lambda: LStockDailyEnv(df_eval)])
    obs = eval_env.reset()
    net_worths = []
    actions = []
    done, state = False, None
    while not done:
        action, state = model.predict(obs, state=state, deterministic=True)
        obs, reward, done, _info = eval_env.step(action)
        net_worths.append(_info[0]['net_worth'])
        # if is_recurrent:
        #     obs[0, :] = new_obs
        # else:
        #     obs = new_obs
        # if action[0] < Actions.Buy:  # Buy
        #     actions.append(1)
        # elif action[0] < Actions.Sell:  # Sell
        #     actions.append(2)
        # else:
        #     actions.append(0)
        actions.append(action[0])
        eval_env.render()

    plt.plot(net_worths)
    plt.plot(actions)
    plt.show()
eval_env = VecTransposeImage(eval_env)

#### Train the model #######################################
# checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=filename+'-logs/', name_prefix='rl_model')
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=EPISODE_REWARD_THRESHOLD, verbose=1)
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=callback_on_best,
                             verbose=1,
                             best_model_save_path=filename + '/',
                             log_path=filename + '/',
                             eval_freq=int(2000 / ARGS.cpu),
                             deterministic=True,
                             render=False)
model.learn(
    total_timesteps=35000,  # int(1e12),
    callback=eval_callback,
    log_interval=100,
)

#### Save the model ########################################
model.save(filename + '/success_model.zip')
print(filename)

#### Print training progression ############################
with np.load(filename + '/evaluations.npz') as data:
    for j in range(data['timesteps'].shape[0]):
        print(str(data['timesteps'][j]) + "," + str(data['results'][j][0][0]))
# Control Variables
episodes = 15000
test_ratio = 0.25
train_episodes = ceil(episodes * (1 - test_ratio))
test_episodes = floor(episodes * test_ratio)

# Init Training Environment
train_env = load_environment()

# Training Stage
print("Start Training Stage")
rl = PPO(MlpPolicy, train_env, verbose=1, n_steps=10)
train_env.reset()
rl.learn(total_timesteps=episodes)
rl.save("breakout_model")
train_env.close()
print("Closed")

copyfile('./data.csv', './data_train.csv')

multi_output = GamePredictor('rf', single_output=False)
single_output = GamePredictor('rf', single_output=True)

dataset = read_dataset("data_train.csv")
# plot_satisfactions("train", dataset)
filtered_dataset = filter_satisfaction(dataset)
multi_output.train(filtered_dataset)
single_output.train(filtered_dataset)
def main():
    # nn = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
    #                          torch.nn.Linear(64, 2))
    os.makedirs(_log_dir, exist_ok=True)

    DoTraining = True
    StartFresh = True
    num_cpu = 8

    if DoTraining:
        # This doesn't work but it might have something to do with how the environment is written
        # num_cpu = 1
        # env = make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir)  # make_vec_env contains Monitor

        # Create the callback: check every 1000 steps
        # callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=_log_dir)

        if StartFresh:
            env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
            env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=2, tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
            env = VecNormalize.load(_stats_path, env)
            env.reset()
            # Use a raw string so the Windows path separators are not treated as escapes
            model = PPO.load(r'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                             tensorboard_log=tb_log)
            model.set_env(env)

        eval_env = gym.make(env_id)
        # print('!!!!Checking Environment!!!!')
        # print(check_env(eval_env))

        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')

        for _ in range(50):
            model.learn(total_timesteps=100000, tb_log_name=env_id, reset_num_timesteps=False)  # , callback=callback
            mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            model.save(_log_dir + 'PPO_{}'.format(model.num_timesteps) + '.mdl')
            env.save(_log_dir + 'vec_normalize_{}'.format(model.num_timesteps) + '.pkl')

    if not DoTraining:
        # eval_env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        # eval_env = VecVideoRecorder(eval_env, video_folder='videos/',
        #                             record_video_trigger=lambda step: step == 0, video_length=500,
        #                             name_prefix='test')
        # eval_env.training = False
        # eval_env.norm_reward = False
        # eval_env.reset()
        eval_env = DummyVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        # eval_env = gym.make(env_id)
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        model = PPO.load(r'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
                         tensorboard_log=tb_log)
        model.set_env(eval_env)

        # record_video(env_id, model, video_length=500, prefix='ppo_' + env_id)
        # Start the video at step=0 and record 500 steps
        # eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
        #                             record_video_trigger=lambda step: step == 0, video_length=500,
        #                             name_prefix='')
        obs = eval_env.reset()
        # for i in range(500):
        #     action, _ = model.predict(obs)
        #     obs, _, _, _ = eval_env.step(action)
        # eval_env.close()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            # eval_env.render()
            if done.any():
                # obs = eval_env.reset()
                # time.sleep(1/30)
                eval_env.close()
                break
def train(env_function, name="model", n_processes: int = 6, seed: int = 0,
          load_checkpoint: Optional[str] = None, from_index=0, to_index=12,
          steps_per_episode=125 * 1000):
    """
    Trains a model with a given environment

    :param env_function: Function that creates a gym.Env
    :param name: name for saving
    :param n_processes: number of processes used for training
    :param seed:
    :param load_checkpoint: if None: create a new model. Else: load the model from this file
    :param steps_per_episode: number of steps for model.learn()
    :param from_index: starting with this episode (for continuing training later than 0)
    :param to_index: last index of episode
    :return:
    """

    def make_env(rank: int):
        """
        Utility function for multiprocessed env.

        :param rank: index of the subprocess (needed to update seed)
        """

        def _init():
            env = env_function()
            # Important: use a different seed for each environment
            env.seed(seed + rank)
            return env

        return _init

    # Create the vectorized environment
    env_vector = SubprocVecEnv([make_env(i) for i in range(n_processes)])

    # Create model
    if load_checkpoint is None:
        model = PPO(
            "MlpPolicy",
            env_vector,
            tensorboard_log="./ppo_trafficgym_tensorboard/",
            verbose=2,
            learning_rate=1e-2,
            # gamma=0.95,
            batch_size=256,
            policy_kwargs=dict(net_arch=[64, 64]),
        )
    else:
        # Pass the vectorized env so the loaded model can continue training
        model = PPO.load(load_checkpoint, env=env_vector)

    # Evaluate before training
    env = Monitor(env_function())
    print("Evaluating...")
    evaluation = evaluate_policy(model, env)
    print("Eval1:", evaluation)

    # Actual training
    t1 = time.time()
    for i in range(from_index, to_index + 1):
        try:
            model.learn(steps_per_episode)
            print(f"Save model {i}")
            model.save(f"{name}{i:02d}.stable_baselines")
        except KeyboardInterrupt:
            print("Interrupted by KeyBoard")
            break
    t2 = time.time()
    print(f"Learning took {t2 - t1} seconds")

    # Evaluate after training
    print("Evaluating...")
    evaluation = evaluate_policy(model, env)
    print("Eval2:", evaluation)
import gym
from stable_baselines3 import PPO

from servo_env_sim import Servo_Env_Sim

# env = gym.make('MountainCarContinuous-v0')
env = Servo_Env_Sim()

model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=50_000)
model.save("Model_1")

# obs = env.reset()
# for i in range(1000):
#     action, _state = model.predict(obs, deterministic=False)
#     obs, reward, done, info = env.step(action)
#     if done:
#         obs = env.reset()
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 8  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=2500)
    model.save("./weights/ppo_cartpole" + str(n))

    del model  # remove to demonstrate saving and loading

    model = PPO.load("./weights/ppo_cartpole" + str(n))

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        body_info = test_body // 100
    else:
        body_info = args.test_as_class
else:
    body_info = 0

eval_env = DummyVecEnv([utils.make_env(rank=0, seed=utils.seed + 1, wrapper=default_wrapper,
                                       render=False, robot_body=test_body, body_info=body_info)])
eval_env = VecNormalize(eval_env, norm_reward=False, **normalize_kwargs)
eval_callback = EvalCallback_with_prefix(
    eval_env=eval_env,
    prefix=f"{test_body}",
    n_eval_episodes=3,
    eval_freq=1e3,  # will implicitly be multiplied by (train_num_envs)
    deterministic=True,
)
all_callbacks.append(eval_callback)

if args.with_checkpoint:
    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=f'{folder}/checkpoints/',
                                             name_prefix=args.train_bodies)
    save_vec_callback = SaveVecNormalizeCallback(save_freq=1000, save_path=f"{folder}/checkpoints/",
                                                 name_prefix=args.train_bodies)
    all_callbacks.append(checkpoint_callback)
    all_callbacks.append(save_vec_callback)

model = PPO('MlpPolicy', env, verbose=1,
            tensorboard_log=f"{folder}/tb/{save_filename}-s{utils.seed}",
            seed=utils.seed, **hyperparams)
model.learn(total_timesteps=total_timesteps, callback=all_callbacks)
model.save(f"{folder}/{save_filename}")

# Important: save the running average; for testing the agent we need that normalization
model.get_vec_normalize_env().save(f"{folder}/{save_filename}-vecnormalize.pkl")
env.close()
def main():
    # multiprocess environment
    n_cpu = 8
    env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    # n_cpu = 1
    # env = gym.make('DYROSTocabi-v1')
    # env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    model = PPO('MlpPolicy', env, verbose=1, n_steps=int(4096 / n_cpu), wandb_use=True)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-02-27 02:20:20.015346"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)
    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
####
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(env.N_ACTIONS),
                                            sigma=0.1 * np.ones(env.N_ACTIONS),
                                            dt=0.005)

#### Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

#### Train the model ###############################################################################
model = PPO(CustomPolicy, env, verbose=1, batch_size=64)

for i in range(step_iters):  # run for step_iters * training_timesteps
    model.learn(total_timesteps=training_timesteps)
    model.save("./models/ppo" + str((i + 1) * training_timesteps))
    # model.save_replay_buffer("./experiences/ppo_experience" + str((i + 1) * training_timesteps))

#### Show (and record a video of) the model's performance ##########################################
env_test = RLTetherAviary(gui=False, record=True)
obs = env_test.reset()
start = time.time()
for i in range(10 * env_test.SIM_FREQ):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env_test.step(action)
    if done:
        break
env_test.close()
env.close()
)
env = make_vec_env(make_configure_env, n_envs=n_cpu, seed=0,
                   vec_env_cls=SubprocVecEnv, env_kwargs=env_kwargs)
model = PPO("MlpPolicy", env,
            n_steps=512 // n_cpu,
            batch_size=64,
            learning_rate=2e-3,
            policy_kwargs=policy_kwargs,
            verbose=2,
            tensorboard_log="./highway_attention_ppo/")

# Train the agent
model.learn(total_timesteps=200 * 1000)
# Save the agent
model.save("ppo-highway")

model = PPO.load("ppo-highway")
env = make_configure_env(**env_kwargs)
evaluate(env, model)
for _ in range(5):
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
zip = "data/box_flipup_ppo_{observations}.zip" log = "/tmp/ppo_box_flipup/" if __name__ == '__main__': num_cpu = 48 if not args.test else 2 env = make_vec_env("BoxFlipUp-v0", n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv, env_kwargs={ 'observations': observations, 'time_limit': time_limit, }) # env = "BoxFlipUp-v0" if args.test: model = PPO('MlpPolicy', env, n_steps=4, n_epochs=2, batch_size=8) elif os.path.exists(zip): model = PPO.load(zip, env, verbose=1, tensorboard_log=log) else: model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log) new_log = True while True: model.learn(total_timesteps=100000 if not args.test else 4, reset_num_timesteps=new_log) if args.test: break model.save(zip) new_log = False
env = ss.pettingzoo_env_to_vec_env_v0(env)
env = ss.concat_vec_envs_v0(env, n_envs, num_cpus=1, base_class='stable_baselines3')
env = VecMonitor(env)

eval_env = base_env.copy().parallel_env()
eval_env = ss.frame_stack_v1(eval_env, 3)
eval_env = ss.pettingzoo_env_to_vec_env_v0(eval_env)
eval_env = ss.concat_vec_envs_v0(eval_env, 1, num_cpus=1, base_class='stable_baselines3')
eval_env = VecMonitor(eval_env)

eval_freq = int(n_timesteps / n_evaluations)
eval_freq = max(eval_freq // (n_envs * n_agents), 1)

model = PPO("MlpPolicy", env, verbose=3, gamma=0.95, n_steps=256, ent_coef=0.0905168,
            learning_rate=0.00062211, vf_coef=0.042202, max_grad_norm=0.9, gae_lambda=0.99,
            n_epochs=5, clip_range=0.3, batch_size=256)
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/', log_path='./logs/',
                             eval_freq=eval_freq, deterministic=True, render=False)
model.learn(total_timesteps=n_timesteps, callback=eval_callback)

model = PPO.load("./logs/best_model")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(mean_reward)
print(std_reward)

render_env = base_env.copy().parallel_env()
render_env = ss.color_reduction_v0(render_env, mode='B')
render_env = ss.resize_v0(render_env, x_size=84, y_size=84)
render_env = ss.frame_stack_v1(render_env, 3)

obs_list = []
i = 0
            verbose=1,
            tensorboard_log=str(common.output_data_folder / "tensorboard" / saved_model_filename),
            seed=common.seed,
            **hyperparams)

if len(args.initialize_weights_from) > 0:
    try:
        load_model = PPO.load(args.initialize_weights_from)
        load_weights = load_model.policy.state_dict()
        model.policy.load_state_dict(load_weights)
        print(f"Weights loaded from {args.initialize_weights_from}")
    except Exception:
        print("Initialize weights error.")
        raise

try:
    model.learn(total_timesteps=args.train_steps, callback=all_callbacks)
except KeyboardInterrupt:
    pass

model.save(str(common.output_data_folder / "models" / saved_model_filename))

if args.vec_normalize:
    # Important: save the running average; for testing the agent we need that normalization
    model.get_vec_normalize_env().save(
        str(common.output_data_folder / "models" / f"{saved_model_filename}.vnorm.pkl"))

venv.close()
policy = MlpPolicy
model = PPO(policy, env,
            learning_rate=2.5e-4,
            n_steps=128,
            batch_size=32,
            n_epochs=3,
            clip_range=0.1,
            ent_coef=.01,
            vf_coef=1,
            # policy_kwargs={'net_arch': [128, 64, 32]},
            verbose=1)

old_weights_filename = 'ppo-torch-mbool-xstep1-death-perframe+pixdiff-newarch'
new_weights_filename = 'ppo-torch-mbool-xstep1-death-perframe+pixdiff-newarch'

if args.mode == 'train':
    callbacks = [
        CheckpointCallback(500000, save_path=f'./checkpoint_weights/{new_weights_filename}/',
                           name_prefix=new_weights_filename),
    ]
    # model = PPO.load(old_weights_filename, env=env)
    model.learn(10000000, callback=callbacks, log_interval=5, tb_log_name=new_weights_filename)
    model.save(new_weights_filename)
elif args.mode == 'test':
    import logging

    model = PPO.load(old_weights_filename)  # , env=env)
    obs = env.reset()

    testlog = logging.getLogger('testing')
    testlog.setLevel(logging.DEBUG)
    fh = logging.FileHandler('./test.log')
    testlog.addHandler(fh)
    ch = logging.StreamHandler()
    testlog.addHandler(ch)
env.add_car(car)

# Uncomment this if you've made any changes to the environment and want to make
# sure that everything is still okay (no output means everything is fine):
# check_env(env)

# Uncomment one of the following depending on what you'd like to do

# A. Use an existing model
# model = PPO.load(model_dir + model_name)

# B. Create and train a new model
timesteps = 10000
model = PPO('MlpPolicy', env, tensorboard_log="./ppo/", verbose=1)
model.learn(total_timesteps=timesteps, callback=TensorboardCallback())
model.save(model_dir + model_name)

# Reset the env
env = Track()
car = Car()
env.add_car(car)
obs = env.reset(new=args.ifreset)  # You can set new=True if you'd like to create a new track

# Run the simulation until the car crashes or finishes
done = False
while not done:
    action, _states = model.predict(obs)
def main():
    set_random_seed(RANDOM_SEED)
    t_start = time()

    name = "LargeFinalLayer"

    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)
    os.makedirs(checkpoint_path, exist_ok=True)

    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)

    results_path = os.path.join(checkpoint_path, "results.json")

    env_args = dict(
        frame_skip=4,
        screen_size=84,
        terminal_on_life_loss=True,
        clip_reward=True,
    )

    # Creates a gym environment for an Atari game using the specified seed and number of environments.
    # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
    # for improved performance.
    # train_env = make_atari_env(ENV_NAME, n_envs=N_ENVS, seed=RANDOM_SEED, wrapper_kwargs=env_args)

    def atari_wrapper(env: gym.Env) -> gym.Env:
        env = AtariWrapper(env, **env_args)
        return env

    def make_env(rank: int, count: int) -> VecEnv:
        return make_vec_env(
            ENV_NAME,
            n_envs=count,
            seed=RANDOM_SEED + rank,
            start_index=0,
            monitor_dir=None,
            wrapper_class=atari_wrapper,
            env_kwargs=None,
            vec_env_cls=None,
            vec_env_kwargs=None,
            monitor_kwargs=None,
        )

    train_env = make_env(0, N_ENVS)
    eval_env = make_env(1, 1)

    # required by models in baselines
    train_env = VecTransposeImage(train_env)
    eval_env = VecTransposeImage(eval_env)

    # setup callbacks to save the model at fixed intervals
    save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=checkpoint_path, name_prefix=name)
    stop_callback = StopTrainingOnRewardThreshold(reward_threshold=EVAL_THRESHOLD)
    time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
    best_callback = EvalCallback(
        eval_env,
        eval_freq=EVAL_FREQ,
        best_model_save_path=checkpoint_path,
        callback_on_new_best=stop_callback,
    )
    list_callback = CallbackList([save_callback, best_callback, time_callback])

    model = PPO(
        CnnPolicy,
        train_env,
        verbose=VERBOSE,
        batch_size=BATCH_SIZE,
        seed=RANDOM_SEED,
        tensorboard_log=log_path,
        learning_rate=LEARNING_RATE,
        n_steps=UPDATE_STEPS,
        n_epochs=N_EPOCHS,
        ent_coef=ENT_COEF,
        vf_coef=VF_COEF,
        clip_range=CLIP_RANGE,
        device=DEVICE_TYPE,
        policy_kwargs=dict(features_extractor_class=FeatureExtractor),
    )

    config_path = os.path.join(checkpoint_path, "cnn_config")
    zip_path = os.path.join(checkpoint_path, "model.zip")

    # output the model config to a file for easier viewing
    with open(config_path, "w") as file:
        file.write(f"{name}\n")
        file.write(str(model.policy.features_extractor.cnn))

    print("Beginning training...")
    model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
    # model.learn(TRAIN_STEPS, tb_log_name="run")
    model.save(zip_path)

    del train_env
    # del eval_env

    time_taken = time() - t_start

    print("Beginning evaluation...")
    # score of the game, standard deviation of multiple runs
    reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

    with open(results_path, "w") as handle:
        handle.write(json.dumps((reward_mean, reward_std, time_taken)))
opt_reward, std_reward = evaluate_policy(opt, env, n_eval_episodes=100)
mean_reward = mean_reward / opt_reward
std_reward = std_reward / opt_reward

leaderboard("MSY", ENV, mean_reward, std_reward, url)
print("algo:", "MSY", "env:", ENV, "mean reward:", mean_reward, "std:", std_reward)

## PPO ######################################################################

# load best tuned parameters...
model = PPO('MlpPolicy', vec_env, verbose=0, tensorboard_log=tensorboard_log, seed=seed)
model.learn(total_timesteps=300000)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

# Rescale score against optimum solution in this environment
opt = escapement(env)
opt_reward, std_reward = evaluate_policy(opt, env, n_eval_episodes=100)
mean_reward = mean_reward / opt_reward
std_reward = std_reward / opt_reward

leaderboard("PPO", ENV, mean_reward, std_reward, url)
print("algo:", "PPO", "env:", ENV, "mean reward:", mean_reward, "std:", std_reward)

## simulate and plot results
df = env.simulate(model, reps=10)
env.plot(df, "results/ppo.png")
policy = env.policyfn(model, reps=10)
        return observation


if __name__ == "__main__":
    from stable_baselines3 import PPO, DQN
    import os
    import time

    model_name = f"snake_{int(time.time())}"
    models_dir = f"models/{model_name}/"
    logdir = f"logs/{model_name}/"

    if not os.path.exists(models_dir):
        os.makedirs(models_dir)
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    env = SnekEnv()
    env.reset()

    model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=logdir)
    # model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=logdir)

    TIMESTEPS = 10000
    while True:
        model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name="PPO")
        model.save(f"{models_dir}/{TIMESTEPS}")

    cv2.destroyAllWindows()
        lambd.append(env.get_lambd())
        N.append(env.get_N())
        # print(env.get_lambd(), env.get_N())
        with open(f"./{args.folder}/buffers/lambda.npy", "wb") as fp:
            pickle.dump(lambd, fp)
        with open(f"./{args.folder}/buffers/N.npy", "wb") as fp:
            pickle.dump(N, fp)
        # model.learn(total_timesteps=5000, log_interval=10,
        #             callback=callback, reset_num_timesteps=False)
    else:
        env.set_N(int(N[i]), list(lambd[i]))
        # print("Lambda, N", N[i], lambd[i])

    if args.algo != 3 and args.algo != 4:
        model.learn(total_timesteps=args.eval_freq, log_interval=10, reset_num_timesteps=False)
        model_name = f"./{args.folder}/models/model_{args.algo}_{j}_{i}"
        model.save(model_name)
        # np.save(f"./{args.folder}/buffers/lambda_{args.algo}_{j}.npy", lambd)
        # np.save(f"./{args.folder}/buffers/N_{args.algo}_{j}.npy", N)
        if args.algo == 0:
            model = PPO.load(model_name, env)
        elif args.algo == 1:
            model = A2C.load(model_name, env)
        elif args.algo == 2:
            model = SAC.load(model_name, env)
    elif args.algo == 3:
        state = train_salmut(env, model, args.eval_freq, args, state, j)

# parameters = atari_parameters if is_atari else regular_parameters
model = PPO(
    "GnnPolicy",
    env,
    # reducing batch_size to 1
    n_steps=1024,
    verbose=1,
    tensorboard_log="runs",
    batch_size=32,
    learning_rate=1e-3,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    vf_coef=0.5,
    policy_kwargs={
        'mlp_extractor_kwargs': {
            'task_name': task_name,
            'xml_assets_path': None
        }
    },
)

mean_reward_before_train = evaluate(model, num_episodes=4)

model.learn(total_timesteps=2000000,
            tb_log_name='{}_{}'.format(task_name, datetime.now().strftime('%d-%m_%H-%M-%S')))
model.save("a2c_ant")

mean_reward = evaluate(model, num_episodes=4)
print(mean_reward_before_train)
print(mean_reward)