def test_channel_first_env(tmp_path):
    # test_cnn uses an environment with an HxWxC setup that is transposed, but we
    # also want to work with CxHxW envs directly, without the transposing wrapper.
    SAVE_NAME = "cnn_model.zip"

    # Create environment with transposed images (CxHxW).
    # If the underlying CNN processes the data in the wrong format,
    # it will raise an error about negative dimension sizes while creating the convolutions.
    env = FakeImageEnv(screen_height=40, screen_width=40, n_channels=1, discrete=True, channel_first=True)

    model = A2C("CnnPolicy", env, n_steps=100).learn(250)
    assert not is_vecenv_wrapped(model.get_env(), VecTransposeImage)

    obs = env.reset()
    action, _ = model.predict(obs, deterministic=True)

    model.save(tmp_path / SAVE_NAME)
    del model

    model = A2C.load(tmp_path / SAVE_NAME)

    # Check that the prediction is the same
    assert np.allclose(action, model.predict(obs, deterministic=True)[0])

    os.remove(str(tmp_path / SAVE_NAME))
def test_env_auto_monitor_wrap(): env = gym.make("Pendulum-v0") model = A2C("MlpPolicy", env) assert model.env.env_is_wrapped(Monitor)[0] is True env = Monitor(env) model = A2C("MlpPolicy", env) assert model.env.env_is_wrapped(Monitor)[0] is True model = A2C("MlpPolicy", "Pendulum-v0") assert model.env.env_is_wrapped(Monitor)[0] is True
def test_a2c(env_id):
    # Note: buffer_size, batch_size and learning_starts are off-policy arguments
    # (e.g. for DQN/SAC); A2C does not accept them, so they are not passed here.
    model = A2C(
        "MlpPolicy",
        env_id,
        seed=0,
        gamma=0.98,
        normalize_advantage=True,
        max_grad_norm=1,
        use_rms_prop=True,
        gae_lambda=0.9,
        n_steps=1,
        learning_rate=0.00033449110737887957,
        ent_coef=0.03826151159203985,
        vf_coef=0.862067985941033,
        policy_kwargs=dict(
            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
            activation_fn=torch.nn.ReLU,
            ortho_init=False,
        ),
        verbose=1,
        create_eval_env=True,
    )
    eval_env = gym.make(env_id)
    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path="./logs/",
        log_path="./logs/",
        eval_freq=250,
        deterministic=False,
        render=False,
    )
    model.learn(total_timesteps=50000, eval_freq=100, callback=eval_callback)
def test_evaluate_policy():
    model = A2C("MlpPolicy", "Pendulum-v0", seed=0)
    n_steps_per_episode, n_eval_episodes = 200, 2
    model.n_callback_calls = 0

    def dummy_callback(locals_, _globals):
        locals_["model"].n_callback_calls += 1

    _, episode_lengths = evaluate_policy(
        model,
        model.get_env(),
        n_eval_episodes,
        deterministic=True,
        render=False,
        callback=dummy_callback,
        reward_threshold=None,
        return_episode_rewards=True,
    )

    n_steps = sum(episode_lengths)
    assert n_steps == n_steps_per_episode * n_eval_episodes
    assert n_steps == model.n_callback_calls

    # Reaching a mean reward of zero is impossible with the Pendulum env
    with pytest.raises(AssertionError):
        evaluate_policy(model, model.get_env(), n_eval_episodes, reward_threshold=0.0)

    episode_rewards, _ = evaluate_policy(model, model.get_env(), n_eval_episodes, return_episode_rewards=True)
    assert len(episode_rewards) == n_eval_episodes
def a2c(env, hyper, policy="MlpPolicy", tensorboard_log=None, verbose=1, seed=0,
        use_sde=True, sde_sample_freq=-1, rms_prop_eps=1e-05, device="auto"):
    lr_schedule = hyper["params_lr_schedule"]
    learning_rate = hyper["params_lr"]
    if lr_schedule == "linear":
        learning_rate = linear_schedule(learning_rate)
    policy_kwargs = make_policy_kwargs(hyper, "a2c")
    model = A2C(
        policy,
        env,
        tensorboard_log=tensorboard_log,
        verbose=verbose,
        seed=seed,
        use_sde=use_sde,
        sde_sample_freq=sde_sample_freq,
        rms_prop_eps=rms_prop_eps,
        learning_rate=learning_rate,
        # np.int was removed in NumPy 1.24; use the builtin int instead
        n_steps=int(hyper["params_n_steps"]),
        gamma=hyper["params_gamma"],
        gae_lambda=hyper["params_gae_lambda"],
        ent_coef=hyper["params_ent_coef"],
        vf_coef=hyper["params_vf_coef"],
        max_grad_norm=hyper["params_max_grad_norm"],
        use_rms_prop=hyper["params_use_rms_prop"],
        normalize_advantage=hyper["params_normalize_advantage"],
        policy_kwargs=policy_kwargs,
        device=device,
    )
    return model
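# The a2c() factory above relies on linear_schedule and make_policy_kwargs
# helpers that are not shown in this snippet. A minimal sketch of
# linear_schedule, assuming it follows the usual Stable-Baselines3 pattern of
# mapping the remaining training progress (1.0 at the start, 0.0 at the end)
# to a linearly decaying learning rate:
from typing import Callable

def linear_schedule(initial_value: float) -> Callable[[float], float]:
    def schedule(progress_remaining: float) -> float:
        # progress_remaining decreases from 1.0 to 0.0 over the course of training
        return progress_remaining * initial_value
    return schedule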
def train(params):
    model = A2C(
        params.get("policy"),
        multi_env,
        verbose=1,
        tensorboard_log=log_dir,
        learning_rate=params.get("learning_rate"),
        n_steps=params.get("n_steps"),
        gamma=params.get("gamma"),
        gae_lambda=params.get("gae_lambda"),
        ent_coef=params.get("ent_coef"),
        vf_coef=params.get("vf_coef"),
        max_grad_norm=params.get("max_grad_norm"),
        rms_prop_eps=params.get("rms_prop_eps"),
        use_rms_prop=params.get("use_rms_prop"),
        use_sde=params.get("use_sde"),
        sde_sample_freq=params.get("sde_sample_freq"),
        normalize_advantage=params.get("normalize_advantage"),
        policy_kwargs=dict(net_arch=[256, 256, dict(vf=[256], pi=[16])]),
    )
    # Train for the configured number of steps
    model.learn(total_timesteps=params.get("train_steps"), eval_env=env)
    # Save the trained agent
    model.save(exp_name)
def test_eval_friendly_error():
    # Tests that EvalCallback stays synchronized with the VecNormalize training stats,
    # and that a friendly error is raised when the eval env is not wrapped like the train env
    train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
    eval_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    eval_env = VecNormalize(eval_env, training=False, norm_reward=False)
    _ = train_env.reset()
    original_obs = train_env.get_original_obs()
    model = A2C("MlpPolicy", train_env, n_steps=50, seed=0)

    eval_callback = EvalCallback(
        eval_env,
        eval_freq=100,
        warn=False,
    )
    model.learn(100, callback=eval_callback)

    # Check synchronization
    assert np.allclose(train_env.normalize_obs(original_obs), eval_env.normalize_obs(original_obs))

    wrong_eval_env = gym.make("CartPole-v1")
    eval_callback = EvalCallback(
        wrong_eval_env,
        eval_freq=100,
        warn=False,
    )
    with pytest.warns(Warning):
        with pytest.raises(AssertionError):
            model.learn(100, callback=eval_callback)
def test_a2c(env_id):
    model = A2C("MlpPolicy", env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
    model.learn(total_timesteps=1000, eval_freq=500)
def main():
    # env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])
    model = A2C('MlpPolicy', env, verbose=1, n_steps=5)
    model.learn(total_timesteps=2500000000)
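# The main() snippets here and further below reference a make_env helper that
# is not shown. A minimal sketch, assuming the standard Stable-Baselines3
# pattern for SubprocVecEnv workers: return a thunk that produces an
# independent, seeded env per worker (the deepcopy is an assumption, since the
# same vix_env instance is passed for every rank):
import copy
from stable_baselines3.common.utils import set_random_seed

def make_env(base_env, rank, seed=0):
    def _init():
        env = copy.deepcopy(base_env)  # give each worker its own env instance
        env.seed(seed + rank)
        return env
    set_random_seed(seed)
    return _init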
def train_A2C(env_train, model_name, timesteps=25000):
    """A2C model"""
    start = time.time()
    model = A2C("MlpPolicy", env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    return model, (end - start) / 60
def dummy_model_distribution_obs_and_actions() -> Tuple[A2C, np.ndarray, np.ndarray]:
    """
    Fixture creating a Pendulum-v1 gym env, an A2C model, and sampling
    10 random observations and actions from the env.

    :return: A2C model, random observations, random actions
    """
    env = gym.make("Pendulum-v1")
    model = A2C("MlpPolicy", env, seed=23)
    random_obs = np.array([env.observation_space.sample() for _ in range(10)])
    random_actions = np.array([env.action_space.sample() for _ in range(10)])
    return model, random_obs, random_actions
def train(time_steps, save=False, **params):
    env = A2CAgent.create_env(1)
    model = A2C('CnnPolicy', env, verbose=params.get('verbose', 1), tensorboard_log=TB_LOGS, ent_coef=0.01)
    model.learn(total_timesteps=time_steps)
    if save:
        model.save(MODEL_PATH)
def train_A2C(env_train, model_name, timesteps=25000):
    """A2C model"""
    start = time.time()
    model = A2C('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (A2C): ', (end - start) / 60, ' minutes')
    return model
def __init__(self, env, hyperparameters=DEFAULT_HYPERPARAMETERS):
    self.P = hyperparameters
    if self.P["model_class"] == "dqn":
        from stable_baselines3 import DQN
        self.model = DQN('MlpPolicy', env, verbose=self.P["verbose"])
        self.model_class = DQN
    elif self.P["model_class"] == "a2c":
        from stable_baselines3 import A2C
        from stable_baselines3.a2c import MlpPolicy
        self.model = A2C(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = A2C
    elif self.P["model_class"] == "ddpg":
        from stable_baselines3 import DDPG
        from stable_baselines3.common.noise import NormalActionNoise
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        self.model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=self.P["verbose"])
        self.model_class = DDPG
    elif self.P["model_class"] == "td3":
        from stable_baselines3 import TD3
        from stable_baselines3.td3.policies import MlpPolicy
        from stable_baselines3.common.noise import NormalActionNoise
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        self.model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=self.P["verbose"])
        self.model_class = TD3
    elif self.P["model_class"] == "ppo":
        from stable_baselines3 import PPO
        from stable_baselines3.ppo import MlpPolicy
        self.model = PPO(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = PPO
    elif self.P["model_class"] == "sac":
        from stable_baselines3 import SAC
        from stable_baselines3.sac import MlpPolicy
        self.model = SAC(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = SAC
    else:
        raise NotImplementedError()
def getting_started():
    env = gym.make("CartPole-v1")
    model = A2C("MlpPolicy", env, verbose=1)
    # model = A2C("MlpPolicy", "CartPole-v1", verbose=1)
    model.learn(total_timesteps=10000)

    obs = env.reset()
    for i in range(1000):
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()
def test_eval_callback_vec_env():
    # tests that the eval callback does not crash when given a vectorized eval env
    n_eval_envs = 3
    train_env = IdentityEnv()
    eval_env = DummyVecEnv([lambda: IdentityEnv()] * n_eval_envs)
    model = A2C("MlpPolicy", train_env, seed=0)
    eval_callback = EvalCallback(
        eval_env,
        eval_freq=100,
        warn=False,
    )
    model.learn(300, callback=eval_callback)
    assert eval_callback.last_mean_reward == 100.0
def test_multiple_stable_baselines(self):
    env_name = "CartPole-v0"
    env = gym.make(env_name)
    models = [
        DQN("MlpPolicy", gym.make(env_name), learning_rate=1e-3),
        A2C(policy="MlpPolicy", env=gym.make(env_name), verbose=1),
        PPO(policy="MlpPolicy", env=gym.make(env_name), verbose=1),
    ]
    model_names = ["Simple DQN", "A2C", "PPO"]
    train_multiple(models, env, 1470, 195, model_names, 200)
    trained_env = get_saved_environments()[0]
    trained_models = get_trained_model_names(trained_env)
    model_saved = set(model_names) == set(trained_models)
    shutil.rmtree(save_path)
    self.assertTrue(model_saved)
def atari_games_example():
    # There already exists an environment generator that will make and wrap Atari environments correctly.
    # Here we are also multi-worker training (n_envs=4 => 4 environments).
    env = make_atari_env("PongNoFrameskip-v4", n_envs=4, seed=0)
    # Frame-stacking with 4 frames
    env = VecFrameStack(env, n_stack=4)

    model = A2C("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=25_000)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def train(self, timesteps=10000, continue_training=False):
    start_time = time.time()
    if not continue_training:
        print("Initializing from scratch")
        model = A2C(self.policy_name, self.env, verbose=1, tensorboard_log=self.log_dir)
    else:
        model = self.load_model()
        print("Restored from {}".format(self.model_path))
    model.learn(total_timesteps=timesteps)
    print('\nTraining complete. Time taken = {} secs'.format(time.time() - start_time))
    model.save(self.model_path)
def main():
    # env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    # Create log dir
    log_dir = './a2c_data'
    os.makedirs(log_dir, exist_ok=True)
    env = VecMonitor(env, log_dir)

    callback = custom_call_back.CustomCallback(check_freq=1000, log_dir=log_dir)
    model = A2C('MlpPolicy', env, verbose=1, n_steps=5)
    model.learn(total_timesteps=2500000000, callback=callback)
def a2c(path):
    env = make_env(HumanPlayer())
    eval_env = make_env(RandomPlayer())
    model = A2C.load(path, env, verbose=1)

    mean, std = evaluate_policy(model, eval_env, n_eval_episodes=10)
    print(f"Loaded policy: mean={mean:.2f} +/- {std}")

    # Show how well we learned by playing a game:
    obs = env.reset()
    done = False
    while not done:
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        print(f"{info['turn']: <4} | Reward: {reward: >4} | {info['winner']}")
        env.render()
    print("done")
def train(env, algo, timesteps):
    env.reset()
    check_env(env)  # check_env returns None; it raises if the env is invalid
    env = FlattenObservation(env)
    print(env.reward_range)
    print(env.action_space)
    if algo == "DQN":
        model = DQN('MlpPolicy', exploration_fraction=0.999, env=env, verbose=1)
    elif algo == "A2C":
        model = A2C('MlpPolicy', env=env, verbose=1)
    elif algo == "PPO":
        model = PPO('MlpPolicy', env=env, verbose=1)
    else:
        raise ValueError(f"Unknown algorithm: {algo}")
    model.learn(total_timesteps=timesteps)
    model.save("model_cups")
def train_rl_agent(ticker):
    # initialize training structures
    train_data_path = '../data/{}_train.csv'.format(ticker.lower())
    train_data = pd.read_csv(train_data_path)
    env = SingleStockTradingEnv(train_data_path, engineer_features,
                                initial_value=INITIAL_PORTFOLIO_VALUE,
                                borrowing=BORROWING, long_only=LONG_ONLY)

    # create and train agent
    agent = A2C('MlpPolicy', env, gamma=0.1)
    for i in range(10):
        print(ticker, i, env.data.shape[0])
        env.reset()
        agent.learn(env.data.shape[0])  # go through the whole history on each training run
    agent.save('checkpoints/{}_rl_no_restrictions'.format(ticker.lower()))
def train_A2C(self, model_name, model_params=A2C_PARAMS):
    """A2C model"""
    from stable_baselines3 import A2C
    env_train = self.env

    start = time.time()
    model = A2C('MlpPolicy', env_train,
                n_steps=model_params['n_steps'],
                ent_coef=model_params['ent_coef'],
                learning_rate=model_params['learning_rate'],
                verbose=model_params['verbose'],
                tensorboard_log=f"{zvt_env['log_path']}/{model_name}")
    model.learn(total_timesteps=model_params['timesteps'], tb_log_name="A2C_run")
    end = time.time()

    model.save(f"{zvt_env['model_path']}/{model_name}")
    print('Training time (A2C): ', (end - start) / 60, ' minutes')
    return model
def evaluate(params):
    # Load saved model
    model = A2C.load(exp_name, env=env)
    results = []
    obs = env.reset()

    # Evaluate the agent: record the accumulated episode reward whenever an
    # episode ends, before resetting it for the next episode
    episode_reward = 0.0
    for _ in range(params.get("test_episodes")):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done or info.get('is_success', False):
            results.append(("Reward:", episode_reward, "Success?", info.get('is_success', False)))
            episode_reward = 0.0
            obs = env.reset()
def __init__(self, model='a2c', use_gp=False, gp_params=None, **kwargs):
    # wrapper around stable_baselines RL implementations
    assert model in ACCEPTED_MODELS, 'Unknown RL model, must be in {}'.format(ACCEPTED_MODELS)
    if model == 'a2c':
        self.rl = A2C(**kwargs)
    elif model == 'ppo':
        self.rl = PPO(**kwargs)
    elif model == 'dqn':
        self.rl = DQN(**kwargs)
    elif model == 'td3':
        self.rl = TD3(**kwargs)

    self.use_gp = use_gp
    if self.use_gp:
        assert gp_params is not None, 'Must provide parameters such as training data, number of iterations, etc. for GPR'
        self.n_train = gp_params['n_train']
        self.retraining_iter = gp_params['training_iter']
        self.cvar_limit = gp_params['cvar_limit']
        self.gp_limit = gp_params['gp_limit']
        self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
        if 'data' in gp_params.keys():
            self.X_train = gp_params['data']['X_train']
            self.y_train = gp_params['data']['y_train']
        else:
            # hard coded to match dimensions of features
            self.X_train = torch.zeros(self.n_train, kwargs['env'].num_features)
            self.y_train = torch.zeros(self.n_train)
        self.gp = ExactGPModel(self.X_train, self.y_train, self.likelihood)
        self.mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.gp)
        self.opt = torch.optim.Adam(self.gp.parameters(), lr=0.1)

    self.shares = 0
    self.cash = 0
    self.obs = []  # holds up to 2 past observations, helps in keeping X, y aligned

    # for plotting
    self.pred_return = 0
    self.pred_lower = 0
    self.pred_upper = 0

    # for debugging
    self.goal_num_shares = 0
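# The GP branch above instantiates an ExactGPModel that is not defined in this
# snippet. A minimal sketch, assuming the standard gpytorch exact-GP pattern
# with a constant mean and an RBF kernel:
import gpytorch

class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        # Return the GP prior at x; the likelihood adds observation noise
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)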
def train_A2C(self, model_name, model_params=config.A2C_PARAMS):
    """A2C model"""
    from stable_baselines3 import A2C
    env_train = self.env

    start = time.time()
    model = A2C('MlpPolicy', env_train,
                n_steps=model_params['n_steps'],
                ent_coef=model_params['ent_coef'],
                learning_rate=model_params['learning_rate'],
                verbose=model_params['verbose'],
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}")
    model.learn(total_timesteps=model_params['timesteps'], tb_log_name="A2C_run")
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (A2C): ', (end - start) / 60, ' minutes')
    return model
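# Both train_A2C methods above read their hyperparameters from a dict
# (A2C_PARAMS / config.A2C_PARAMS) that is not shown. A hypothetical example
# of its shape, with values chosen purely for illustration:
A2C_PARAMS = {
    "n_steps": 5,           # rollout length per update
    "ent_coef": 0.01,       # entropy bonus coefficient
    "learning_rate": 7e-4,  # optimizer step size
    "verbose": 0,           # 0 = silent, 1 = training logs
    "timesteps": 25000,     # total environment steps for learn()
}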
def test_set_logger(tmp_path):
    # set up logger
    new_logger = configure(str(tmp_path), ["stdout", "csv", "tensorboard"])

    # Default outputs with verbose=0
    model = A2C("MlpPolicy", "CartPole-v1", verbose=0).learn(4)
    assert model.logger.output_formats == []

    model = A2C("MlpPolicy", "CartPole-v1", verbose=0, tensorboard_log=str(tmp_path)).learn(4)
    assert str(tmp_path) in model.logger.dir
    assert isinstance(model.logger.output_formats[0], TensorBoardOutputFormat)

    # Check that the env variable works
    new_tmp_path = str(tmp_path / "new_tmp")
    os.environ["SB3_LOGDIR"] = new_tmp_path
    model = A2C("MlpPolicy", "CartPole-v1", verbose=0).learn(4)
    assert model.logger.dir == new_tmp_path

    # Default outputs with verbose=1
    model = A2C("MlpPolicy", "CartPole-v1", verbose=1).learn(4)
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)

    # with tensorboard
    model = A2C("MlpPolicy", "CartPole-v1", verbose=1, tensorboard_log=str(tmp_path)).learn(4)
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 2
    model.learn(32)

    # set new logger
    model.set_logger(new_logger)
    # Check that the new logger is correctly set up
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], CSVOutputFormat)
    assert isinstance(model.logger.output_formats[2], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 3
    model.learn(32)

    model = A2C("MlpPolicy", "CartPole-v1", verbose=1)
    model.set_logger(new_logger)
    model.learn(32)
    # Check that the new logger is not overwritten
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], CSVOutputFormat)
    assert isinstance(model.logger.output_formats[2], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 3
def test_evaluate_vector_env(n_envs):
    # Tests that the number of episodes evaluated is correct
    n_eval_episodes = 6
    env = make_vec_env("CartPole-v1", n_envs)
    model = A2C("MlpPolicy", "CartPole-v1", seed=0)

    class CountCallback:
        def __init__(self):
            self.count = 0

        def __call__(self, locals_, globals_):
            if locals_["done"]:
                self.count += 1

    count_callback = CountCallback()
    evaluate_policy(model, env, n_eval_episodes, callback=count_callback)
    assert count_callback.count == n_eval_episodes
def make_gif_example():
    # Make a GIF of a trained agent
    import imageio

    model = A2C("MlpPolicy", "LunarLander-v2").learn(100_000)

    images = []
    obs = model.env.reset()
    img = model.env.render(mode="rgb_array")
    for i in range(350):
        images.append(img)
        action, _ = model.predict(obs)
        obs, _, _, _ = model.env.step(action)
        img = model.env.render(mode="rgb_array")

    imageio.mimsave(
        "lander_a2c.gif",
        [np.array(img) for i, img in enumerate(images) if i % 2 == 0],
        fps=29,
    )