def test_custom_vec_env(tmp_path):
    """
    Stand alone test for a special case (passing a custom VecEnv class) to avoid doubling the number of tests.
    """
    monitor_dir = tmp_path / "test_make_vec_env/"
    env = make_vec_env(
        "CartPole-v1",
        n_envs=1,
        monitor_dir=monitor_dir,
        seed=0,
        vec_env_cls=SubprocVecEnv,
        vec_env_kwargs={"start_method": None},
    )

    assert env.num_envs == 1
    assert isinstance(env, SubprocVecEnv)
    assert os.path.isdir(monitor_dir)
    # Kill subprocess
    env.close()
    # Cleanup folder
    shutil.rmtree(monitor_dir)

    # This should fail because DummyVecEnv does not have any keyword argument
    with pytest.raises(TypeError):
        make_vec_env("CartPole-v1", n_envs=1, vec_env_kwargs={"dummy": False})
def _ppo_training(cls, env_name: str, env_kwargs: Dict[str, Any], agent_kwargs: Dict[str, Any]) -> bool:
    """
    Run the PPO algorithm on a given environment and check whether the reward threshold has been exceeded.
    """
    # Create a multiprocess environment
    train_env = make_vec_env(
        env_id=env_name, env_kwargs=env_kwargs, n_envs=int(N_THREADS // 2), vec_env_cls=SubprocVecEnv, seed=SEED
    )
    test_env = make_vec_env(
        env_id=env_name, env_kwargs=env_kwargs, n_envs=1, vec_env_cls=DummyVecEnv, seed=SEED
    )

    # Create the learning agent according to the chosen algorithm
    config = cls._get_default_config_stable_baselines()
    config.update(agent_kwargs)
    train_agent = PPO('MlpPolicy', train_env, **config, verbose=False)
    train_agent.eval_env = test_env

    # Run the learning process
    return train(train_agent, max_timesteps=150000)
def test_vec_env_monitor_kwargs():
    env = make_vec_env("MountainCarContinuous-v0", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": False})
    assert env.get_attr("allow_early_resets")[0] is False

    env = make_atari_env("BreakoutNoFrameskip-v4", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": False})
    assert env.get_attr("allow_early_resets")[0] is False

    env = make_vec_env("MountainCarContinuous-v0", n_envs=1, seed=0, monitor_kwargs={"allow_early_resets": True})
    assert env.get_attr("allow_early_resets")[0] is True

    env = make_atari_env(
        "BreakoutNoFrameskip-v4",
        n_envs=1,
        seed=0,
        monitor_kwargs={"allow_early_resets": True},
    )
    assert env.get_attr("allow_early_resets")[0] is True
def ppo_stable_baselines_training():
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    envs = make_vec_env(config.env_name, n_envs=config.num_processes)
    model = PPO(
        "CnnPolicy",
        envs,
        verbose=1,
        tensorboard_log="./runs/",
        clip_range=config.clip_param,
        n_steps=50,
        learning_rate=config.lr,
        gamma=config.gamma,
        gae_lambda=config.gae_lambda,
        ent_coef=config.entropy_coef,
        max_grad_norm=config.max_grad_norm,
        vf_coef=config.value_loss_coef,
        batch_size=config.num_mini_batch,
    )
    model.learn(total_timesteps=config.num_steps, log_interval=1, callback=WandbStableBaselines3Callback())
    model.save(f"{config.env_name}_stable_baselines_ppo")
def test_vec_env_wrapper_kwargs():
    env = make_vec_env(
        "MountainCarContinuous-v0", n_envs=1, seed=0, wrapper_class=MaxAndSkipEnv, wrapper_kwargs={"skip": 3}
    )
    assert env.get_attr("_skip")[0] == 3
def init_adv(adv_env_id, disable_adv=False, env_kwargs=None):
    bridge = Bridge()
    default_env_kwargs = {'renders' if 'CartPole' in adv_env_id else 'render': render}
    if env_kwargs is None:
        env_kwargs = {}
    env_kwargs.update(default_env_kwargs)

    env = make_vec_env(adv_env_id, env_kwargs=env_kwargs, seed=seed)
    env = VecNormalize(env)

    prot_agent = PPO('MlpPolicy', env, verbose=verbose, seed=seed, n_steps=ts, bridge=bridge, is_protagonist=True)
    if disable_adv:
        bridge.link_agents(prot_agent, None)
    else:
        adv_agent = PPO('MlpPolicy', env, verbose=verbose, seed=seed, n_steps=ts, bridge=bridge, is_protagonist=False)
        bridge.link_agents(prot_agent, adv_agent)
    return prot_agent, env
def test_multiprocessing(model_class):
    use_discrete_actions = model_class not in [SAC, TD3, DDPG]

    def make_env():
        env = DummyDictEnv(use_discrete_actions=use_discrete_actions, channel_last=False)
        env = gym.wrappers.TimeLimit(env, 100)
        return env

    env = make_vec_env(make_env, n_envs=2, vec_env_cls=SubprocVecEnv)

    kwargs = {}
    n_steps = 256

    if model_class in {A2C, PPO}:
        kwargs = dict(
            n_steps=128,
            policy_kwargs=dict(
                net_arch=[32],
                features_extractor_kwargs=dict(cnn_output_dim=32),
            ),
        )

    model = model_class("MultiInputPolicy", env, gamma=0.5, seed=1, **kwargs)
    model.learn(total_timesteps=n_steps)
def create_zoo_env(env_id, stats_dir, hyperparams, should_render=False):
    env_wrapper = get_wrapper_class(hyperparams)

    vec_env_cls = DummyVecEnv
    if "Bullet" in env_id and should_render:
        vec_env_cls = SubprocVecEnv

    env = make_vec_env(env_id, wrapper_class=env_wrapper, vec_env_cls=vec_env_cls)

    if stats_dir is not None:
        if hyperparams["normalize"]:
            norm_fpath = pjoin(stats_dir, "vecnormalize.pkl")
            if os.path.exists(norm_fpath):
                env = VecNormalize.load(norm_fpath, env)
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {norm_fpath} not found")

    max_episode_steps = gym.make(env_id).spec.max_episode_steps
    Spec = namedtuple("Spec", ["max_episode_steps"])
    env.spec = Spec(max_episode_steps=max_episode_steps)

    return env
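# Hypothetical usage sketch, not taken from the original source: the env id, stats path and the
# minimal hyperparams dict below are assumptions (it also assumes get_wrapper_class tolerates a
# dict without an "env_wrapper" entry). It loads an evaluation env whose saved VecNormalize
# statistics are kept frozen, as create_zoo_env does above.
def load_zoo_eval_env_sketch():
    hyperparams = {"normalize": True}
    return create_zoo_env(
        env_id="HalfCheetahBulletEnv-v0",
        stats_dir="logs/ppo/HalfCheetahBulletEnv-v0_1",
        hyperparams=hyperparams,
        should_render=False,
    )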
def get_env(op_policies, conf):
    env = make_vec_env(Expando, env_kwargs=dict(**conf, policies_other=op_policies), n_envs=1)
    env.reset()
    return env
def test_replay_buffer_normalization(replay_buffer_cls):
    env = {ReplayBuffer: DummyEnv, DictReplayBuffer: DummyDictEnv}[replay_buffer_cls]
    env = make_vec_env(env)
    env = VecNormalize(env)

    buffer = replay_buffer_cls(100, env.observation_space, env.action_space)

    # Interact and store transitions
    env.reset()
    obs = env.get_original_obs()
    for _ in range(100):
        action = env.action_space.sample()
        _, _, done, info = env.step(action)
        next_obs = env.get_original_obs()
        reward = env.get_original_reward()
        buffer.add(obs, next_obs, action, reward, done, info)
        obs = next_obs

    sample = buffer.sample(50, env)
    # Test observation normalization
    for observations in [sample.observations, sample.next_observations]:
        if isinstance(sample, DictReplayBufferSamples):
            for key in observations.keys():
                assert th.allclose(observations[key].mean(0), th.zeros(1), atol=1)
        elif isinstance(sample, ReplayBufferSamples):
            assert th.allclose(observations.mean(0), th.zeros(1), atol=1)
    # Test reward normalization
    assert np.allclose(sample.rewards.mean(0), np.zeros(1), atol=1)
def create_environment(config):
    if config.atari_wrapper:
        env = make_atari_env(config.environment, n_envs=config.workers)
        env = VecFrameStack(env, n_stack=1)
    else:
        env = make_vec_env(config.environment, n_envs=config.workers)
    env = DummyEnvWrapper(env, config.add_stoch)
    return env
def test_discrete_obs_space(model_class, env):
    env = make_vec_env(env, n_envs=2, seed=0)
    kwargs = {}
    if model_class == DQN:
        kwargs = dict(buffer_size=1000, learning_starts=100)
    else:
        kwargs = dict(n_steps=256)
    model_class("MlpPolicy", env, **kwargs).learn(256)
def make_vec_env(self, dataset, env_args):
    env_args["df"] = dataset
    env = make_vec_env('crypt-v001', env_kwargs=env_args)
    env = VecCheckNan(env, raise_exception=True)
    env = VecNormalize(
        env,
        norm_obs=True,
        norm_reward=False,
        clip_obs=10.0,
        gamma=0.95,
    )
    return env
def test_warn_dqn_multi_env():
    with pytest.warns(UserWarning, match="The number of environments used is greater"):
        DQN(
            "MlpPolicy",
            make_vec_env("CartPole-v1", n_envs=2),
            buffer_size=100,
            target_update_interval=1,
        )
def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False) -> VecEnv:
    """
    Create the environment and wrap it if necessary.

    :param n_envs:
    :param eval_env: Whether it is an environment used for evaluation or not
    :param no_log: Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: the vectorized environment, with appropriate wrappers
    """
    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else self.save_path

    # env = SubprocVecEnv([make_env(env_id, i, self.seed) for i in range(n_envs)])
    # On most env, SubprocVecEnv does not help and is quite memory hungry
    env = make_vec_env(
        env_id=self.env_id,
        n_envs=n_envs,
        seed=self.seed,
        env_kwargs=self.env_kwargs,
        monitor_dir=log_dir,
        wrapper_class=self.env_wrapper,
        vec_env_cls=self.vec_env_class,
        vec_env_kwargs=self.vec_env_kwargs,
    )

    # Special case for GoalEnvs: log success rate too
    if "Neck" in self.env_id or self.is_robotics_env(self.env_id):
        self._log_success_rate(env)

    # Wrap the env into a VecNormalize wrapper if needed
    # and load saved statistics when present
    env = self._maybe_normalize(env, eval_env)

    # Optional Frame-stacking
    if self.frame_stack is not None:
        n_stack = self.frame_stack
        env = VecFrameStack(env, n_stack)
        if self.verbose > 0:
            print(f"Stacking {n_stack} frames")

    # Wrap if needed to re-order channels
    # (switch from channel last to channel first convention)
    if is_image_space(env.observation_space):
        if self.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    # check if wrapper for dict support is needed
    if self.algo == "her":
        if self.verbose > 0:
            print("Wrapping into a ObsDictWrapper")
        env = ObsDictWrapper(env)

    return env
def train_from_logs(algo, env_id, eval_env=None, log_dir="logs",
                    total_timesteps=300000, tensorboard_log=None,
                    seed=0, verbose=0, n_envs=4, outdir="results",
                    use_sde=True, i=0):
    if eval_env is None:
        eval_env = env_id

    # create env
    if algo in ["a2c", "ppo"]:
        env = make_vec_env(env_id, n_envs=n_envs, seed=seed)
    else:
        env = make_vec_env(env_id, n_envs=1, seed=seed)

    # Create and train agent
    agent = AGENT[algo]
    hyper = best_hyperpars(log_dir, env_id, algo, i=i)
    print("")
    print(algo, env_id)
    print(hyper)

    # Unless turned off in hyperparameters.yml
    # env = VecNormalize(env, gamma=hyper["params_gamma"])
    model = agent(env, hyper, 'MlpPolicy', verbose=verbose,
                  tensorboard_log=tensorboard_log, seed=seed, use_sde=use_sde)
    model.learn(total_timesteps=total_timesteps)

    # evaluate agent
    custom_eval(model, eval_env, algo, seed=seed, outdir=outdir, value=hyper["value"])
def multiprocessing_with_off_policy_algorithms_example():
    # Multiprocessing with off-policy algorithms.
    env = make_vec_env("Pendulum-v1", n_envs=4, seed=0)

    # We collect 4 transitions per call to `env.step()`
    # and perform 2 gradient steps per call to `env.step()`.
    # If gradient_steps=-1, we would instead do 4 gradient steps per call to `env.step()`.
    model = SAC("MlpPolicy", env, train_freq=1, gradient_steps=2, verbose=1)
    model.learn(total_timesteps=10_000)
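# A minimal follow-up sketch (not from the original source) for the gradient_steps=-1 case
# mentioned in the comment above: with 4 environments and train_freq=1, SAC then performs as
# many gradient steps per call to `env.step()` as transitions were collected, i.e. 4 here.
def multiprocessing_off_policy_auto_gradient_steps_sketch():
    env = make_vec_env("Pendulum-v1", n_envs=4, seed=0)
    model = SAC("MlpPolicy", env, train_freq=1, gradient_steps=-1, verbose=1)
    model.learn(total_timesteps=10_000)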
def main():
    # Create the callback: check every 1000 steps
    log_dir = 'log'
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()

        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()

        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False)  # , callback=callback, callback=TensorboardCallback()

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000, prefix='ppo_' + env_name + videoName)
def process(layers, case_number, steps, envs, verbose):
    name = f"c3a_A2C_{str(layers).replace(' ', '')}_{case_number}"
    print(f"Case: {name}")
    env = make_vec_env('PerigeeRaising-Continuous3D-v0',
                       n_envs=envs,
                       wrapper_class=lambda x: wrap(x))
    agent = create_agent(env, name, case_number, layers, verbose)
    print("  --> Training...")
    train_agent(agent, name, steps=steps, callbacks=[])
    print("  --> Testing...")
    test_agent(agent)
def make_env(rank: int, count: int) -> VecEnv:
    return make_vec_env(
        ENV_NAME,
        n_envs=count,
        seed=RANDOM_SEED + rank,
        start_index=0,
        monitor_dir=None,
        wrapper_class=atari_wrapper,
        env_kwargs=None,
        vec_env_cls=None,
        vec_env_kwargs=None,
        monitor_kwargs=None,
    )
def test_offpolicy_multi_env(model_class):
    kwargs = {}
    if model_class in [SAC, TD3, DDPG]:
        env_id = "Pendulum-v0"
        policy_kwargs = dict(net_arch=[64], n_critics=1)
        # Check auto-conversion to VectorizedActionNoise
        kwargs = dict(action_noise=NormalActionNoise(np.zeros(1), 0.1 * np.ones(1)))
        if model_class == SAC:
            kwargs["use_sde"] = True
            kwargs["sde_sample_freq"] = 4
    else:
        env_id = "CartPole-v1"
        policy_kwargs = dict(net_arch=[64])

    def make_env():
        env = gym.make(env_id)
        # to check that the code handling timeouts runs
        env = gym.wrappers.TimeLimit(env, 50)
        return env

    env = make_vec_env(make_env, n_envs=2)

    model = model_class(
        "MlpPolicy",
        env,
        policy_kwargs=policy_kwargs,
        learning_starts=100,
        buffer_size=10000,
        verbose=0,
        train_freq=5,
        **kwargs,
    )
    model.learn(total_timesteps=150)

    # Check that gradient_steps=-1 works as expected:
    # perform as many gradient_steps as transitions collected
    train_freq = 3
    model = model_class(
        "MlpPolicy",
        env,
        policy_kwargs=policy_kwargs,
        learning_starts=0,
        buffer_size=10000,
        verbose=0,
        train_freq=train_freq,
        gradient_steps=-1,
        **kwargs,
    )
    model.learn(total_timesteps=train_freq)
    assert model.logger.name_to_value["train/n_updates"] == train_freq * env.num_envs
def main():
    # get init time and use it for save path
    now = datetime.now()
    save_path = './trained/' + now.strftime("%B %d, %Y - %H.%M")
    os.mkdir(save_path)

    # using sound library for pure fun
    engine = pyttsx3.init()  # object creation
    engine.setProperty('rate', 150)  # setting up new voice rate

    with open('config.yml') as file:
        configurations = yaml.safe_load(file)
    configurations['general']['flightgear'] = 'false'
    configurations['general']['agent_interaction_freq'] = 5
    with open('config.yml', 'w') as file:
        yaml.dump(configurations, file)

    env_make = make_vec_env(configurations['general']['env'], n_envs=1, seed=0)
    env = VecNormalize(env_make, norm_obs=True, norm_reward=True, clip_obs=10.)

    # Stop training when the model reaches the reward threshold
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=300, verbose=1)
    eval_callback = EvalCallback(
        env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=save_path,
        eval_freq=configurations['train']['timesteps'] / 100,
        deterministic=True)

    with open(save_path + '/env.pkl', "wb") as file_handler:
        pickle.dump(env, file_handler, pickle.HIGHEST_PROTOCOL)

    if configurations['train']['model'] == "none":
        print("--> Alican's LOG: A new model will be created for training")
        model = Agents.create_model(env, configurations['general']['algorithm'], save_path)
    else:
        print("--> Alican's LOG: An existing model will be loaded for training")
        model = Agents.load_model(
            env,
            configurations['general']['algorithm'],
            configurations['train']['model'] + '/best_model')

    model.learn(total_timesteps=configurations['train']['timesteps'],
                callback=eval_callback,
                log_interval=20)

    engine.say("Training is finished!")
    engine.runAndWait()
    engine.stop()
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]

    env_params = {
        'time_step': TIME_STEP,
        'robot_class': QuadrupedRobot,
        'on_rack': False,
        'enable_self_collision': True,
        'motor_control_mode': MotorControlMode.HYBRID_COMPUTED_POS_TROT,
        'train_or_test': test_or_train,
    }

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data/policies')
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    policy_kwargs = {"net_arch": [{"pi": [512, 256], "vf": [512, 256]}]}

    if TEST_OR_TRAIN == "train":
        env = make_vec_env(env_change_input,
                           n_envs=NUM_CPUS,
                           seed=0,
                           env_kwargs=env_params,
                           vec_env_cls=SubprocVecEnv)
        env = VecNormalize(env)
        if not os.path.exists(policy_save_dir):
            os.makedirs(policy_save_dir)
        model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        # env = env_change_input(time_step=env_params['time_step'],
        #                        robot_class=env_params['robot_class'],
        #                        on_rack=env_params['on_rack'],
        #                        enable_self_collision=env_params['enable_self_collision'],
        #                        motor_control_mode=env_params['motor_control_mode'],
        #                        train_or_test=env_params['train_or_test'])
        env = env_change_input(**env_params)
        model_load_path = os.path.join(policy_save_dir, 'ppo_3_17-03-2021_15-39-42')
        model = PPO.load(model_load_path)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
def train_and_test_ec(config, video_length_=1000, total_timesteps_=10000):
    print(config)
    if config.atari_wrapper:
        train_env = make_atari_env(config.environment, n_envs=config.workers)
        train_env = VecFrameStack(train_env, n_stack=1)
        shape = (84, 84, 1)
    else:
        train_env = make_vec_env(config.environment, n_envs=config.workers)
        shape = train_env.observation_space.shape

    rnet = RNetwork(shape, config.ensemble_size)
    vec_episodic_memory = [
        EpisodicMemory([64], rnet.embedding_similarity, replacement='random', capacity=200)
        for _ in range(config.workers)
    ]
    target_image_shape = list(shape)
    # assert type(config.add_stoch) == bool, "Please indicate whether or not you want stoch added"
    train_env = CuriosityEnvWrapper(train_env, vec_episodic_memory, rnet.embed_observation,
                                    target_image_shape, config.add_stoch)

    r_network_trainer = RNetworkTrainer(rnet,
                                        learning_rate=config.rnet_lr,
                                        observation_history_size=2000,
                                        training_interval=1000)
    train_env.add_observer(r_network_trainer)

    tb_dir = os.path.join(config.log_dir, config.tb_subdir)
    model = config.agent(config.policy_model, train_env, config, verbose=config.verbose, tensorboard_log=tb_dir)
    model.learn(total_timesteps=total_timesteps_)
    print("Finished learning")
    # model.save("models/" + config.experiment)

    obs = train_env.reset()
    for i in range(video_length_ + 1):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = train_env.step(action)
        train_env.render()
        if done.any():
            obs = train_env.reset()
    train_env.close()
def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False) -> VecEnv:
    """
    Create the environment and wrap it if necessary.

    :param n_envs:
    :param eval_env: Whether it is an environment used for evaluation or not
    :param no_log: Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: the vectorized environment, with appropriate wrappers
    """
    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else self.save_path

    monitor_kwargs = {}
    # Special case for GoalEnvs: log success rate too
    if "Neck" in self.env_id or self.is_robotics_env(self.env_id) or "parking-v0" in self.env_id:
        monitor_kwargs = dict(info_keywords=("is_success",))

    # On most env, SubprocVecEnv does not help and is quite memory hungry,
    # therefore we use DummyVecEnv by default
    env = make_vec_env(
        env_id=self.env_id,
        n_envs=n_envs,
        seed=self.seed,
        env_kwargs=self.env_kwargs,
        monitor_dir=None,  # Avoid useless monitor file spam from plotting
        wrapper_class=self.env_wrapper,
        vec_env_cls=self.vec_env_class,
        vec_env_kwargs=self.vec_env_kwargs,
        monitor_kwargs=monitor_kwargs,
    )

    # Wrap the env into a VecNormalize wrapper if needed
    # and load saved statistics when present
    env = self._maybe_normalize(env, eval_env)

    # Optional Frame-stacking
    if self.frame_stack is not None:
        n_stack = self.frame_stack
        env = VecFrameStack(env, n_stack)
        if self.verbose > 0:
            print(f"Stacking {n_stack} frames")

    # Wrap if needed to re-order channels
    # (switch from channel last to channel first convention)
    if is_image_space(env.observation_space) and not is_image_space_channels_first(env.observation_space):
        if self.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    return env
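# A standalone sketch (not from the original source) of the image-channel check used in
# create_envs above: VecTransposeImage is only applied when the observation space is an image
# that is not already channel-first, which is the layout the SB3 CNN policies expect.
def transpose_image_obs_if_needed(env):
    if is_image_space(env.observation_space) and not is_image_space_channels_first(env.observation_space):
        env = VecTransposeImage(env)
    return env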
def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class):
    env = make_vec_env(env_id, n_envs, vec_env_cls=vec_env_cls, wrapper_class=wrapper_class, monitor_dir=None, seed=0)

    assert env.num_envs == n_envs

    if vec_env_cls is None:
        assert isinstance(env, DummyVecEnv)
        if wrapper_class is not None:
            assert isinstance(env.envs[0], wrapper_class)
        else:
            assert isinstance(env.envs[0], Monitor)
    else:
        assert isinstance(env, SubprocVecEnv)
        # Kill subprocesses
        env.close()
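# A hedged sketch of the pytest parametrization that typically drives a test like
# test_make_vec_env above; the exact parameter grid is an assumption, not taken from the
# original source. It covers a string env id and an env factory, the default DummyVecEnv and
# SubprocVecEnv, and an optional wrapper class.
@pytest.mark.parametrize("env_id", ["CartPole-v1", lambda: gym.make("CartPole-v1")])
@pytest.mark.parametrize("n_envs", [1, 2])
@pytest.mark.parametrize("vec_env_cls", [None, SubprocVecEnv])
@pytest.mark.parametrize("wrapper_class", [None, gym.wrappers.TimeLimit])
def test_make_vec_env_parametrized_sketch(env_id, n_envs, vec_env_cls, wrapper_class):
    # Delegates to the test body above; only the parameter grid is illustrative.
    test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class)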
def run_exp(exp_params):
    activation_fn = activation_fns[exp_params['act']]
    layers = [int(nb) for nb in exp_params['layers'].split(',')]
    nb_threads = int(exp_params['nb_threads'])
    freq_save = int(exp_params['save_every'])
    env_id = 'Trading-{}'.format(envs[exp_params['env']])
    train_steps = int(exp_params['train_steps']) * 10e4

    tmp_env = gym.make(env_id)
    tmp_env.reset()
    env_data = tmp_env.get_env_specs()

    trained_agents = glob.glob('trained_models/{}/*'.format(
        env_data['folder_name'] if 'folder_name' in env_data.keys() else env_data['env_name']))
    run_idx = len([agent for agent in trained_agents if 'agent' in agent])
    run_name = 'agent_{:03d}'.format(run_idx)

    model = PPO('MlpPolicy',
                make_vec_env(env_id, nb_threads),
                verbose=1,
                device=torch.device('cpu'),
                tensorboard_log='./runs/{}/'.format(
                    env_data['folder_name'] if 'folder_name' in env_data.keys() else env_data['env_name']))
    model.learn(total_timesteps=train_steps, tb_log_name=run_name)

    env_data = model.env.envs[0].get_env_specs()
    env_data['run_name'] = run_name

    env_folder = 'trained_models/{}'.format(
        env_data['folder_name'] if 'folder_name' in env_data.keys() else env_data['env_name'])
    if not os.path.exists('trained_models'):
        os.mkdir('trained_models')
    if not os.path.exists(env_folder):
        os.mkdir(env_folder)
    if not os.path.exists('{}/{}'.format(env_folder, run_name)):
        os.mkdir('{}/{}'.format(env_folder, run_name))

    model.save('{}/{}/{}'.format(env_folder, run_name, run_name))
    recap = pd.Series(env_data.values(), index=env_data.keys())
    recap.to_csv('{}/{}/recap.csv'.format(env_folder, run_name), index=True)
def test_evaluate_vector_env(n_envs):
    # Tests that the number of episodes evaluated is correct
    n_eval_episodes = 6
    env = make_vec_env("CartPole-v1", n_envs)
    model = A2C("MlpPolicy", "CartPole-v1", seed=0)

    class CountCallback:
        def __init__(self):
            self.count = 0

        def __call__(self, locals_, globals_):
            if locals_["done"]:
                self.count += 1

    count_callback = CountCallback()

    evaluate_policy(model, env, n_eval_episodes, callback=count_callback)

    assert count_callback.count == n_eval_episodes
def train(args):
    cuda_availability = torch.cuda.is_available()
    print('\n*************************')
    print('`CUDA` available: {}'.format(cuda_availability))
    print('Device specified: {}'.format(args.device))
    print('*************************\n')

    # load the config of the trained model:
    with open(args.pretrained_output / "train_arguments.yaml") as yaml_data:
        pretrain_arguments = yaml.load(yaml_data, Loader=yaml.FullLoader)

    pretrained_model = algorithms[pretrain_arguments["alg"]].load(
        args.pretrained_output / "".join(pretrain_arguments["model_name"].split(".")[:-1]),
        device='cpu')

    # Prepare tensorboard logging
    log_name = '{}_{}'.format(pretrain_arguments["experiment_name"],
                              datetime.now().strftime('%d-%m_%H-%M-%S'))
    run_dir = args.tensorboard_log + "/" + log_name
    Path(run_dir).mkdir(parents=True, exist_ok=True)

    callbacks = []
    # callbacks.append(CheckpointCallback(
    #     save_freq=1000000, save_path=run_dir, name_prefix='rl_model'))
    callbacks.append(LoggingCallback(logpath=run_dir))

    train_args = copy.copy(pretrain_arguments)
    pyaml.dump(train_args, open(os.path.join(run_dir, 'train_arguments.yaml'), 'w'))

    # Create the vectorized environment
    n_envs = pretrain_arguments["n_envs"]  # Number of processes to use
    env = make_vec_env(pretrain_arguments["task_name"], n_envs=n_envs)

    pretrained_model.env = env
    pretrained_model.learn(total_timesteps=args.total_timesteps,
                           callback=callbacks,
                           tb_log_name=log_name)
    pretrained_model.save(
        os.path.join(args.tensorboard_log + "/" + log_name, args.model_name))
def objective(trial):
    # Getting the hyperparameters to test
    params, policy_kwargs = algo_utils[args.algorithm][0](trial)

    # Instantiating the environments
    env = make_vec_env(args.env, n_envs=params["n_envs"])
    params.pop("n_envs")

    # Instantiating the model and performing training
    model = algo_utils[args.algorithm][1]("MlpPolicy",
                                          env,
                                          verbose=0,
                                          policy_kwargs=policy_kwargs,
                                          **params)
    model.learn(total_timesteps=int(args.n_timesteps))

    # Evaluating the agent and reporting the mean cumulative reward
    eval_env = gym.make(args.env)
    eval_df = simulate_mdp_vec(env, eval_env, model, args.n_eval_episodes)
    mean_rew = eval_df.groupby(["rep"]).sum().mean(axis=0)["reward"]

    del model
    return mean_rew
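# A hedged usage sketch (an assumption, not part of the original source): the objective above
# is typically handed to an Optuna study that maximizes the mean cumulative reward.
def run_study_sketch(n_trials=20):
    import optuna  # would normally live with the other imports

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_params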