def run_experiment(verbose, tensorboard_log, learning_rate):
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env, osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    n_actions = env.action_space.shape[-1]
    stddev = 0.2  # unused
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" % (name,
                                       np.format_float_scientific(nIter),
                                       np.format_float_scientific(learning_rate)))
    env.close()
def main():
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "custom_integrations"))
    print("PokemonRed-GameBoy" in retro.data.list_games(
        inttype=retro.data.Integrations.ALL))
    env = retro.make("PokemonRed-GameBoy", inttype=retro.data.Integrations.ALL)
    print(env)
    print(env.action_space)
    time.sleep(3)
    env = make_vec_env(lambda: env, n_envs=1)
    # check_env(env, warn=True)
    time.sleep(3)
    model = DQN(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
    env.close()  # NOTE: unreachable; the loop above never exits
def run_train(self):
    env = CustomEnv(self.path_planner, self.behavior_planner, self.event)
    env = make_vec_env(lambda: env, n_envs=1)
    model = None
    if self.event == Scenario.LANE_CHANGE:
        model = DQN(CustomLaneChangePolicy, env, verbose=1,
                    learning_starts=256, batch_size=256,
                    exploration_fraction=0.9,
                    target_network_update_freq=100,
                    tensorboard_log=dir_path + '/Logs/')
    if self.event == Scenario.PEDESTRIAN:
        model = DQN(CustomPedestrianPolicy, env, verbose=1,
                    learning_starts=256, batch_size=256,
                    exploration_fraction=0.9,
                    target_network_update_freq=100,
                    tensorboard_log=dir_path + '/Logs/Ped',
                    gamma=0.93, learning_rate=0.0001)
    model.learn(total_timesteps=20000)
    model.save(MODEL_SAVE_PATH)
def run_test(self):
    env = CustomEnv(self.path_planner, self.behavior_planner, self.event)
    env = make_vec_env(lambda: env, n_envs=1)
    if self.event == Scenario.LANE_CHANGE:
        model = DQN.load(MODEL_LOAD_PATH)
    if self.event == Scenario.PEDESTRIAN:
        model = DQN.load(MODEL_LOAD_PATH)
    obs = env.reset()
    count = 0
    success = 0
    while count < 500:
        done = False
        while not done:
            action, _ = model.predict(obs)
            print("Action taken:", RLDecision(action))
            obs, reward, done, info = env.step(action)
            # print("Reward", reward)
        count += 1
        if info[0]["success"]:
            success += 1
        print("Count ", count, "Success ", success,
              "Success Rate:", success * 100 / float(count), "%")
    print("Success Rate ", success / count, success, count)
def model_free_policy(self, ne, n_epochs=1, train=True, load_model=False):
    if self.autoencoder is None:
        self.setup_autoencoder(ne.get_obs())
        assert self.autoencoder is not None
    if ne.autoencoder is None:
        ne.set_autoencoder(self.autoencoder)
        ne.autoencoder = self.autoencoder
    if train:
        fn = "models/model1.h5"
        self.mf_policy = PPO2(env=ne, policy=MlpPolicy, n_steps=40,
                              verbose=2, noptepochs=10, learning_rate=3e-4,
                              ent_coef=0.1, gamma=0.1)
        if load_model:
            # PPO2.load is a classmethod that returns a new model;
            # assign the result instead of discarding it.
            self.mf_policy = PPO2.load(fn, env=make_vec_env(lambda: ne))
        else:
            self.mf_policy.learn(total_timesteps=n_epochs * 40)
            self.mf_policy.save(fn)
    encoded_obs = ne.rl_obs()
    return self.mf_policy.step([encoded_obs], deterministic=True)[0].flatten()
def _load(model_name):
    model = PPO2.load(model_name)
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env, osp.join(logger, "videos_3"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model.set_env(env)
    model.learn(total_timesteps=int(nIter), log_interval=100)
    # model.save(exp_name)
    model.save(model_name + "_new")
    env.close()
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1
    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8
    save_video_length = 200
    save_video_interval = 1000000

    file = open('sac_done.txt', 'w+')
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]
    stddev = 0.2  # unused

    pool = multiprocessing.Pool(processes=4)
    for lr in [1e-5]:  # , 5e-4, 1e-5
        logger = osp.join(
            expDir, name,
            'logs%s_%s' % (np.format_float_scientific(nIter),
                           np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env, osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy,
            env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
    env.close()
    file.close()
    pool.close()
    pool.join()
def main(output_folder_path: Path):
    env = make_vec_env('roar-occu-map-e2e-v0')
    env.reset()
    model_params: dict = {
        "verbose": 1,
        "env": env,
        "n_steps": 100,
        # "render": True,
    }
    model, callbacks = setup(model_params, output_folder_path)
    model = model.learn(total_timesteps=int(1e6),
                        callback=callbacks,
                        reset_num_timesteps=False)
def test_custom_vec_env():
    """
    Stand alone test for a special case (passing a custom VecEnv class)
    to avoid doubling the number of tests.
    """
    monitor_dir = 'logs/test_make_vec_env/'
    env = make_vec_env('CartPole-v1', n_envs=1,
                       monitor_dir=monitor_dir, seed=0,
                       vec_env_cls=SubprocVecEnv,
                       vec_env_kwargs={'start_method': None})

    assert env.num_envs == 1
    assert isinstance(env, SubprocVecEnv)
    assert os.path.isdir(monitor_dir)
    # Kill subprocess
    env.close()
    # Cleanup folder
    shutil.rmtree(monitor_dir)

    # This should fail because DummyVecEnv does not have any keyword argument
    with pytest.raises(TypeError):
        make_vec_env('CartPole-v1', n_envs=1, vec_env_kwargs={'dummy': False})
def record(exp):
    model = SAC.load(exp)
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env, osp.join(logger, "videos_2"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model.set_env(env)
    model.learn(total_timesteps=2000, log_interval=100)
    # model.save(expDir + "/%s/%d" % (name, nIter))
    env.close()
def mk_env_agent(env_class, registered_model, params, gui=False):
    model = SAC.load(registered_model.source)
    params_fname = f'{registered_model.source}.json'  # FIXME
    with open(params_fname, 'r') as fp:
        loaded_params = json.load(fp)
    params = {**loaded_params, **params}  # merge, overriding loaded params
    env = make_vec_env(lambda: env_class(params['NJ'], params, gui=gui),
                       n_envs=1)
    model.set_env(env)
    env.env_method('set_render_info', {
        'name': registered_model.name,
        'version': registered_model.version
    })  # FIXME
    return env, model
def train_stable_baselines(args):
    """
    Trains with PPO2 on the specified environment.

    Parameters:
        args - the arguments defined in main

    Return:
        None
    """
    # Import stable baselines
    from stable_baselines import PPO2
    from stable_baselines.common.callbacks import CheckpointCallback
    from stable_baselines.common.cmd_util import make_vec_env
    from stable_baselines.common.evaluation import evaluate_policy

    # Store hyperparameters and total timesteps to run by environment
    hyperparameters = {}
    total_timesteps = 0
    if args.env == 'Pendulum-v0':
        hyperparameters = {'n_steps': 2048, 'nminibatches': 32, 'lam': 0.95,
                           'gamma': 0.99, 'noptepochs': 10, 'ent_coef': 0.0,
                           'learning_rate': 3e-4, 'cliprange': 0.2,
                           'verbose': 1, 'seed': args.seed}
        total_timesteps = 1005000
    elif args.env == 'BipedalWalker-v3':
        hyperparameters = {'n_steps': 2048, 'nminibatches': 32, 'lam': 0.95,
                           'gamma': 0.99, 'noptepochs': 10, 'ent_coef': 0.001,
                           'learning_rate': 2.5e-4, 'cliprange': 0.2,
                           'verbose': 1, 'seed': args.seed}
        total_timesteps = 1405000
    elif args.env == 'LunarLanderContinuous-v2':
        hyperparameters = {'n_steps': 1024, 'nminibatches': 32, 'lam': 0.98,
                           'gamma': 0.999, 'noptepochs': 4, 'ent_coef': 0.01,
                           'cliprange': 0.2, 'verbose': 1, 'seed': args.seed}
        total_timesteps = 1005000
    elif args.env == 'MountainCarContinuous-v0':
        hyperparameters = {'n_steps': 256, 'nminibatches': 8, 'lam': 0.94,
                           'gamma': 0.99, 'noptepochs': 4, 'ent_coef': 0.0,
                           'cliprange': 0.2, 'verbose': 1, 'seed': args.seed}
        total_timesteps = 405000

    # Create log dir
    log_dir = "/tmp/gym/"
    os.makedirs(log_dir, exist_ok=True)

    # Make the environment and model, and train
    env = make_vec_env(args.env, n_envs=1, monitor_dir=log_dir)
    model = PPO2('MlpPolicy', env, **hyperparameters)
    model.learn(total_timesteps)
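    # Hedged sketch (not in the original): evaluate_policy is imported above
    # but never called; scoring the trained model could look like this.
    # n_eval_episodes=10 is an arbitrary assumption.
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
    print("mean_reward={:.2f} +/- {:.2f}".format(mean_reward, std_reward))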
def sb_model_train(rl_manager):
    env = CustomEnv(rl_manager)
    env = make_vec_env(lambda: env, n_envs=1)
    model = DQN(CustomPolicy, env, verbose=1, learning_starts=256,
                batch_size=256, exploration_fraction=0.5,
                target_network_update_freq=10, tensorboard_log='./Logs/')
    # model = DQN(MlpPolicy, env, verbose=1, learning_starts=64,
    #             target_network_update_freq=50, tensorboard_log='./Logs/')
    # model = DQN.load("DQN_Model_SimpleSim_30k", env=env,
    #                  exploration_fraction=0.1, tensorboard_log='./Logs/')
    model.learn(total_timesteps=10000)
    # model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="./Logs/")
    # model.learn(total_timesteps=20000)
    model.save(dir_path + "/DQN_Model_SimpleSim")
    # sb_model_test(rl_manager)
    return
def learn():
    # expDir = '/home/shivanik/lab/pointExp/state/'
    # verbose = 1
    # num_objs = 1
    # name = 'ppo2_%d' % num_objs
    # logger = osp.join(expDir, name, 'logs')
    # video_folder = osp.join(logger, 'videos')
    # nIter = 1e7
    # save_video_interval = 5000
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs, 1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(env, video_folder,
                           record_video_trigger=lambda x: x % save_video_interval == 0,
                           video_length=save_video_length,
                           name_prefix="Video-{}")
    model = PPO2(MlpPolicy, env, verbose=verbose, tensorboard_log=logger)
    model.learn(total_timesteps=int(nIter))
    model.save(expDir + "/%s/%s" % (name, np.format_float_scientific(nIter)))
def main():
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "custom_integrations"))
    print("PokemonRed-GameBoy" in retro.data.list_games(
        inttype=retro.data.Integrations.ALL))
    env = retro.make("PokemonRed-GameBoy",
                     inttype=retro.data.Integrations.ALL,
                     obs_type=retro.Observations.RAM,
                     use_restricted_actions=retro.Actions.DISCRETE)
    print(env)
    # print(env.action_space)
    # NOTE: the lambda returns the same retro env instance for all four slots;
    # retro supports only one emulator instance per process.
    vec_env = make_vec_env(lambda: env, n_envs=4)
    # time.sleep(3)
    model = A2C(MlpPolicy, vec_env, verbose=1)
    start_time = time.time()
    model.learn(total_timesteps=200000)
    print("TRAINING COMPLETE! Time elapsed: ", str(time.time() - start_time))

    print("Attempting to get first pokemon!")
    start_time = time.time()
    printed_done = False
    # sampled_info = False
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        # if not sampled_info:
        #     print("Info:\n", info, "\n</info>")
        #     sampled_info = True
        if dones and not printed_done:
            print("Success! time elapsed: ", str(time.time() - start_time))
            printed_done = True
    env.close()
def create_env(n_envs, eval_env=False):
    if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
        if n_envs > 1:
            env = SubprocVecEnv([
                make_env(env_id, i, args.seed,
                         log_dir=monitor_path,
                         wrapper_class=env_wrapper,
                         env_kwargs=env_kwargs) for i in range(n_envs)
            ])
        else:
            env = DummyVecEnv([
                make_env(env_id, 0, args.seed,
                         log_dir=monitor_path,
                         wrapper_class=env_wrapper,
                         env_kwargs=env_kwargs)
            ])
        # Removed a leftover that unconditionally overwrote env with
        # DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)]) and re-applied
        # env_wrapper; make_env already handles wrapping.
    elif (algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1:
        raise ValueError(
            "Error: {} does not support multiprocessing!".format(algo))
    elif (algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1:
        raise ValueError(
            "Error: {} uses MPI for multiprocessing!".format(algo))
    else:
        env = make_vec_env(env_id, n_envs=n_envs, seed=args.seed,
                           monitor_dir=monitor_path,
                           wrapper_class=env_wrapper,
                           env_kwargs=env_kwargs)
    if args.normalize:
        # choose from multiple options
        # env = VecNormalize(env, clip_obs=np.inf)
        env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
        # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
    return env
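# Hedged usage sketch (not in the original): when args.normalize is set,
# create_env returns a VecNormalize wrapper whose running obs/return statistics
# are usually persisted between training and evaluation. This assumes the
# stable-baselines 2.10 save/load API; the pickle path is a hypothetical choice.
env = create_env(n_envs=4)
# ... model training on env happens here ...
env.save('logs/vecnormalize.pkl')
eval_env = VecNormalize.load('logs/vecnormalize.pkl', create_env(1, eval_env=True))
eval_env.training = False     # freeze running statistics at evaluation time
eval_env.norm_reward = False  # report unnormalized rewards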
def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class):
    env = make_vec_env(env_id, n_envs,
                       vec_env_cls=vec_env_cls,
                       wrapper_class=wrapper_class,
                       monitor_dir=None, seed=0)

    assert env.num_envs == n_envs

    if vec_env_cls is None:
        assert isinstance(env, DummyVecEnv)
        if wrapper_class is not None:
            assert isinstance(env.envs[0], wrapper_class)
        else:
            assert isinstance(env.envs[0], Monitor)
    else:
        assert isinstance(env, SubprocVecEnv)
        # Kill subprocesses
        env.close()
def sb_model_test(rl_manager):
    env = CustomEnv(rl_manager)
    env = make_vec_env(lambda: env, n_envs=1)
    model = DQN.load(dir_path + "/DQN_Model_SimpleSim_30k")
    obs = env.reset()
    count = 0
    success = 0
    while count < 100:
        done = False
        print("Count ", count, "Success ", success)
        while not done:
            action, _ = model.predict(obs)
            print(action)
            obs, reward, done, info = env.step(action)
        count += 1
        if reward == 5:
            success += 1
    print("Success Rate ", success / count, success, count)
    rl_manager.finish = True
def recieve(sid, data):
    global done
    global reward
    global maxactions
    jsonInput = json.loads(data)
    maxactions = jsonInput['maxactions']
    trainepisodes = jsonInput['trainepisodes']
    evalepisodes = jsonInput['evalepisodes']
    totalepisodes = trainepisodes + evalepisodes
    env = UnrealEnvWrap()
    # wrap it
    env = make_vec_env(lambda: env, n_envs=1)

    # Train the agent with different algorithms from stable baselines
    # model = DQN(MlpPolicy, env, verbose=1, tensorboard_log="./DQN_newobservations/")
    model = DQN(MlpPolicy, env, verbose=1)
    # model = A2C(MlpPolicy, env, verbose=1, tensorboard_log="./A2C_newobservations/")
    # model = A2C(MlpPolicy, env, verbose=1)
    print("Agent training in process...")
    model.learn(total_timesteps=trainepisodes)

    # Test the trained agent (currently not needed; all testing occurs in Unreal itself)
    env.render(mode='console')
    # env.render()
    obs = env.reset()
    print("Training complete, starting evaluation of trained model:")
    intaction = 0
    # Begin strategic behavior
    for step in range(evalepisodes):
        action, _ = model.predict(obs, deterministic=True)
        intaction = action[0]
        print("Action: ", intaction)
        obs, reward, done, info = env.step(action)
        print('obs=', obs, 'reward=', reward, 'done=', done)
    sio.disconnect(sid)
def train(agent=None):
    weights = {'fr': 0.3, 'fl': 20, 'fk': 20}
    depth, width, move_dist, plan_dist = 3, 3, 3, 3
    mkenv = lambda: Env(depth, width, move_dist, plan_dist,
                        max_steps=20, weights=weights, obstacle_pct=0.1)
    eval_callback = EvalCallback(mkenv(),
                                 best_model_save_path='logs/models',
                                 log_path='logs',
                                 eval_freq=1_000,
                                 deterministic=True,
                                 render=False)
    vecenv = make_vec_env(mkenv, 32, monitor_dir='logs/training')
    if agent:
        agent.set_env(vecenv)
    else:
        hparams = dict(n_steps=64, nminibatches=64, gamma=0.90,
                       learning_rate=2e-5, ent_coef=0.01, cliprange=0.4,
                       noptepochs=25, lam=0.99)
        agent = PPO2('MlpPolicy', vecenv, verbose=True, **hparams)
    agent.learn(1_000_000, callback=eval_callback)
    agent.save('logs/models/final')
    vecenv.close()
    return agent
#! /usr/bin/env python
import gym
gym.logger.set_level(40)
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import env_yaw
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy

env = gym.make("Yaw-v0")
env = make_vec_env(lambda: env, n_envs=1)

# model = ACKTR.load("models/acktr_goleft", env=env)
model = ACKTR('MlpPolicy', env, verbose=1)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
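# Hedged sketch (not in the original): as written above, evaluate_policy scores
# a freshly initialized ACKTR model. To score a trained policy instead, training
# would come first; 25000 timesteps is an arbitrary assumption.
model.learn(total_timesteps=25000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"trained mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")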
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# import tensorflow as tf
from stable_baselines import PPO2, logger
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines.common.policies import MlpPolicy, CnnLstmPolicy, CnnLnLstmPolicy
import gym
from stable_baselines.ppo2.cppo2 import CPPO2  # needed: CPPO2.load is called below
import numpy as np

env = make_vec_env('CartPole-v1', 1, 17)

model_random = PPO2(MlpPolicy, env, verbose=1, cliprange=0.1, seed=17,
                    n_cpu_tf_sess=1)  # , previous_model="previous.zip")
model_random.save("random")

# model = CPPO2(MlpPolicy, env, verbose=1, previous_model="random.zip", cliprange=0.01)
# model = model_random
# model = PPO2.load("random.zip", env=env)
model = CPPO2.load("random.zip", previous_model_path="random.zip", env=env)
model.learn(total_timesteps=4000)
# model.save("continuous_from_random")

sum_rewards = 0
episode_rewards = []
obs = env.reset()
def __init__(self, algorithm="SAC", load=True, agent_name="Agent001"):
    self.agent_name = agent_name
    # self.env = LearningRocket(visualize=False)
    # self.env = NormalizeActionWrapper(self.env)
    # self.eval_env = LearningRocket(visualize=True)
    # self.eval_env = NormalizeActionWrapper(self.eval_env)
    # self.env = SubprocVecEnv([lambda: LearningRocket(visualize=False) for i in range(4)])
    self.env = make_vec_env(LearningRocket, n_envs=16)
    # self.eval_env = VecNormalize(DummyVecEnv([lambda: LearningRocket(visualize=True) for i in range(1)]))
    self.eval_env = make_vec_env(lambda: LearningRocket(visualize=True),
                                 n_envs=1)
    # self.eval_env = VecNormalize(self.eval_env)
    self.eval_callback = EvalCallback(self.eval_env,
                                      best_model_save_path='Agent007',
                                      log_path='./logs/',
                                      eval_freq=10000,
                                      deterministic=True,
                                      render=False,
                                      n_eval_episodes=1)
    kai_policy = dict(act_fun=tf.nn.tanh, net_arch=[400, 300])
    # check_env(self.env, warn=True)
    """
    if algorithm == "SAC":
        if load is True:
            self.model = SAC.load(agent_name, env=self.env,
                                  tensorboard_log="./rocket_tensorboard/")
            # self.model.ent_coef = 0.2
        else:
            self.model = SAC('MlpPolicy', self.env, verbose=1,
                             tensorboard_log="./rocket_tensorboard/", ent_coef=5)
        print("Trainer Set for SAC")
    """
    if algorithm == "TD3":
        n_actions = self.env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        if load is True:
            self.model = TD3.load(agent_name, env=self.env,
                                  tensorboard_log="./rocket_tensorboard/")
            # file = open('replay_buffer', 'rb')
            # self.model.replay_buffer = pickle.load(file)
            # file.close()
        else:
            self.model = TD3(MlpPolicy, self.env, action_noise=action_noise,
                             batch_size=768, gamma=0.95, learning_rate=1e-4,
                             learning_starts=20000, verbose=1,
                             tensorboard_log="./rocket_tensorboard/",
                             policy_kwargs=dict(layers=[400, 300]))
        print("Trainer Set for TD3")
    elif algorithm == "PPO2":
        if load is True:
            self.model = PPO2.load(agent_name, env=self.env,
                                   tensorboard_log="./rocket_tensorboard/")
            self.eval_env = VecNormalize.load(self.agent_name + "vEnv",
                                              self.eval_env)
            # self.eval_env.clip_obs = 500
            # self.env = VecNormalize(self.env)
            self.env = VecNormalize.load(self.agent_name + "vEnv", self.env)
            # self.env.clip_obs = 500
            # self.env.norm_obs = False
            # self.eval_env.norm_obs = False
        else:
            self.model = PPO2(PPOMlpPolicy, self.env, n_steps=1024,
                              nminibatches=32, lam=0.98, gamma=0.999,
                              noptepochs=4, ent_coef=0.01, verbose=1,
                              tensorboard_log="./rocket_tensorboard/",
                              policy_kwargs=dict(layers=[400, 300]))
            self.eval_env = VecNormalize(self.eval_env)
            self.env = VecNormalize(self.env)
            # self.eval_env.clip_obs = 500
            # self.env.clip_obs = 500
            # self.env.norm_obs = False
            # self.eval_env.norm_obs = False
        print("Trainer set for PPO2. I am speed.")
import math

import gym
from gym.spaces import MultiDiscrete
from stable_baselines import PPO2, A2C
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.callbacks import BaseCallback, CheckpointCallback, EveryNTimesteps
from curiosity_mask.util import create_dummy_action_mask as mask
from curiosity_mask.util import set_action_mask_gait as gait_mask
from curiosity_mask.ui import UI
from transitions import Machine

# env = gym.make('Acrobot-v1')
env = make_vec_env('Acrobot-v1', n_envs=40)


class Balance(object):

    def __init__(self):
        self.action_mask = []
        self.num_timesteps = None

    def is_sufficient_torque(self, event):
        # return False  # abs(event.kwargs.get('torque')) > 9.8
        # if math.cos(event.kwargs.get('angle_1')) < -0.7:  # Torso link is lifted high
        #     sys.exit()
        return math.cos(event.kwargs.get('angle_1')) < -0.7

    def back_to_swing(self, event):
        return False
import os

import gym

from stable_baselines import PPO2
from stable_baselines.common.cmd_util import make_vec_env
# for plotting results:
from stable_baselines import results_plotter
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results

# creating environment
env = gym.make("Witches-v0")
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = Monitor(env, log_dir)
# wrap it (NOTE: the lambda returns the same Monitor instance for all 60 slots)
env_vec = make_vec_env(lambda: env, n_envs=60)

# If the environment doesn't follow the interface, an error will be thrown
# check_env(env, warn=True)

model = PPO2('MlpLstmPolicy', env_vec, verbose=1)
time_steps = 1e8
model.learn(int(time_steps))

# export model as onnx:
# 1. export params
# 2. load params in pytorch
model.save("tmp/witches")
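# Hedged sketch (not in the original) for step 1 of the ONNX to-do above:
# stable-baselines models expose get_parameters(), an ordered dict mapping
# tensor names to numpy arrays that a PyTorch re-implementation could load
# before exporting to ONNX. The .npz path is a hypothetical choice.
import numpy as np

params = model.get_parameters()               # {tensor_name: np.ndarray}
np.savez("tmp/witches_params.npz", **params)  # step 1: export params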
def __init__(self):
    rospy.init_node('train_node', anonymous=True)
    env = gym.make("Yaw-v0")
    env = make_vec_env(lambda: env, n_envs=1)
    model = DQN('MlpPolicy', env, verbose=1).learn(1000)
ALGO = PPO2

# We will create one environment to evaluate the agent on
eval_env = gym.make(env_id)

# DummyVecEnv vs SubprocVecEnv
reward_averages = []
reward_std = []
training_times = []
total_procs = 0
for n_procs in PROCESSES_TO_TEST:
    total_procs += n_procs
    print('Running for n_procs = {}'.format(n_procs))
    # Here we are using only one process even for n_env > 1;
    # this is equivalent to DummyVecEnv([make_env(env_id, i + total_procs) for i in range(n_procs)])
    train_env = make_vec_env(env_id, n_envs=n_procs)

    rewards = []
    times = []
    for experiment in range(NUM_EXPERIMENTS):
        # it is recommended to run several experiments due to variability in results
        train_env.reset()
        model = ALGO('MlpPolicy', train_env, verbose=0)
        start = time.time()
        model.learn(total_timesteps=TRAIN_STEPS)
        times.append(time.time() - start)
        mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=EVAL_EPS)
        rewards.append(mean_reward)
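    # Hedged sketch (not in the original): the reward_averages / reward_std /
    # training_times lists initialized above are never filled in the visible
    # fragment; the natural per-n_procs aggregation would be:
    train_env.close()
    reward_averages.append(np.mean(rewards))
    reward_std.append(np.std(rewards))
    training_times.append(np.mean(times))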
import time

import gym
import numpy as np

from stable_baselines import ACKTR
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines.common import set_global_seeds
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.cmd_util import make_vec_env

# By default, we use a DummyVecEnv as it is usually faster (cf. doc)
env_id = "CartPole-v1"
num_cpu = 4  # Number of processes to use
vec_env = make_vec_env(env_id, n_envs=num_cpu)

model = ACKTR('MlpPolicy', vec_env, verbose=0)
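# Hedged sketch (not in the original): the explicit SubprocVecEnv equivalent of
# the make_vec_env call above, following the standard stable-baselines
# multiprocessing recipe and using the set_global_seeds helper imported above.
def make_env(env_id, rank, seed=0):
    """Return a thunk that creates and seeds one environment instance."""
    def _init():
        set_global_seeds(seed + rank)
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _init

if __name__ == '__main__':
    # Subprocess creation must be guarded on platforms that spawn processes.
    subproc_env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])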
    )
    env.reset()
    model = PPO2.load('../data/pretrained_models/controlTableLine/PPO')
    for _ in range(video_length + 1):
        action, _ = model.predict(obs, deterministic=True)
        obs, _, _, _ = env.step(action)
    env.close()
elif mode == 'gif':
    import imageio
    from stable_baselines.common.cmd_util import make_vec_env

    images = []
    env = make_vec_env(controlTableLine, n_envs=1)
    model = PPO2.load('../data/pretrained_models/controlTableLine/PPO', env)
    obs = model.env.reset()
    img = model.env.render(mode='rgb_array')
    for i in range(1200):
        images.append(img)
        action, _ = model.predict(obs, deterministic=True)
        obs, _, _, _ = model.env.step(action)
        img = model.env.render(mode='rgb_array')
    imageio.mimsave(
        '../data/videos/PPO_controlTableLine.gif',
        [np.array(img) for i, img in enumerate(images) if i % 2 == 0],
        fps=29)
else:
    show_env = controlTableLine()
def train(env_id, num_timesteps, seed, policy,
          n_envs=8, nminibatches=4, n_steps=128,
          peer=0., scheduler=None, individual=False, repeat=1):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update.
        For recurrent policies, the number of environments run in parallel
        should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment
        per update (i.e. batch size is n_steps * n_env where n_env is
        number of environment copies running in parallel)
    """
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]

    is_atari = 'NoFrameskip' in env_id
    make_env = lambda: VecFrameStack(make_atari_env(env_id, n_envs, seed), 4) if is_atari \
        else make_vec_env(env_id, n_envs, seed)
    print(make_env)

    models = {
        "A": PPO2(
            policy=policy, policy_kwargs={'view': 'even'},
            n_steps=n_steps, env=make_env(), nminibatches=nminibatches,
            lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
            learning_rate=2.5e-4, cliprange=lambda f: f * 0.1, verbose=1),
        "B": PPO2(
            policy=policy, policy_kwargs={'view': 'odd'},
            n_steps=n_steps, env=make_env(), nminibatches=nminibatches,
            lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
            learning_rate=2.5e-4, cliprange=lambda f: f * 0.1, verbose=1),
    }
    views = {view: View(models[view], peer=peer) for view in ("A", "B")}

    n_batch = n_envs * n_steps
    n_updates = num_timesteps // n_batch

    for t in range(n_updates):
        logger.info("current episode:", t)
        for view in "A", "B":
            models[view].learn(n_batch)
        if not individual:
            for view, other_view in zip(("A", "B"), ("B", "A")):
                obses, _, _, actions, _, _, _, _, _ = models[other_view].rollout
                views[view].peer = peer * scheduler(t)
                logger.info("current alpha:", views[view].peer)
                for _ in range(repeat):
                    views[view].learn(obses, actions,
                                      views[view].learning_rate / repeat)

    for view in "A", "B":
        models[view].env.close()
        del models[view]  # free memory