Code example #1
def test_channel_first_env(tmp_path):
    # test_cnn uses an environment with HxWxC observations that gets transposed, but we
    # also want to support CxHxW envs directly, without the transposing wrapper.
    SAVE_NAME = "cnn_model.zip"

    # Create environment with transposed images (CxHxW).
    # If the underlying CNN processes the data in the wrong format,
    # it will raise an error about negative dimension sizes while creating the convolutions.
    env = FakeImageEnv(screen_height=40,
                       screen_width=40,
                       n_channels=1,
                       discrete=True,
                       channel_first=True)

    model = A2C("CnnPolicy", env, n_steps=100).learn(250)

    assert not is_vecenv_wrapped(model.get_env(), VecTransposeImage)

    obs = env.reset()

    action, _ = model.predict(obs, deterministic=True)

    model.save(tmp_path / SAVE_NAME)
    del model

    model = A2C.load(tmp_path / SAVE_NAME)

    # Check that the prediction is the same
    assert np.allclose(action, model.predict(obs, deterministic=True)[0])

    os.remove(str(tmp_path / SAVE_NAME))
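The test above relies on SB3 test utilities that the snippet does not import. Assuming a recent SB3 1.x layout (module paths moved slightly across versions), the imports would look roughly like this:

import os

import numpy as np

from stable_baselines3 import A2C
from stable_baselines3.common.envs import FakeImageEnv
from stable_baselines3.common.vec_env import VecTransposeImage, is_vecenv_wrapped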
Code example #2
def test_env_auto_monitor_wrap():
    env = gym.make("Pendulum-v0")
    model = A2C("MlpPolicy", env)
    assert model.env.env_is_wrapped(Monitor)[0] is True

    env = Monitor(env)
    model = A2C("MlpPolicy", env)
    assert model.env.env_is_wrapped(Monitor)[0] is True

    model = A2C("MlpPolicy", "Pendulum-v0")
    assert model.env.env_is_wrapped(Monitor)[0] is True
Code example #3
def test_a2c(env_id):
    model = A2C("MlpPolicy",
                env_id,
                seed=0,
                gamma=0.98,
                normalize_advantage=True,
                max_grad_norm=1,
                use_rms_prop=True,
                gae_lambda=0.9,
                n_steps=1,
                learning_rate=0.00033449110737887957,
                ent_coef=0.03826151159203985,
                vf_coef=0.862067985941033,
                # buffer_size=10000,     # off-policy setting, not accepted by A2C
                # batch_size=512,        # off-policy setting, not accepted by A2C
                # learning_starts=3000,  # off-policy setting, not accepted by A2C
                policy_kwargs=dict(net_arch=[dict(pi=[64, 64], vf=[64, 64])],
                                   activation_fn=torch.nn.ReLU,
                                   ortho_init=False),
                verbose=1,
                create_eval_env=True)
    eval_env = gym.make(env_id)
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=250,
                                 deterministic=False,
                                 render=False)
    model.learn(total_timesteps=50000, eval_freq=100)
Code example #4
def test_evaluate_policy():
    model = A2C("MlpPolicy", "Pendulum-v0", seed=0)
    n_steps_per_episode, n_eval_episodes = 200, 2
    model.n_callback_calls = 0

    def dummy_callback(locals_, _globals):
        locals_["model"].n_callback_calls += 1

    _, episode_lengths = evaluate_policy(
        model,
        model.get_env(),
        n_eval_episodes,
        deterministic=True,
        render=False,
        callback=dummy_callback,
        reward_threshold=None,
        return_episode_rewards=True,
    )

    n_steps = sum(episode_lengths)
    assert n_steps == n_steps_per_episode * n_eval_episodes
    assert n_steps == model.n_callback_calls

    # Reaching a mean reward of zero is impossible with the Pendulum env
    with pytest.raises(AssertionError):
        evaluate_policy(model, model.get_env(), n_eval_episodes, reward_threshold=0.0)

    episode_rewards, _ = evaluate_policy(model, model.get_env(), n_eval_episodes, return_episode_rewards=True)
    assert len(episode_rewards) == n_eval_episodes
Code example #5
def a2c(env, hyper, policy = "MlpPolicy", tensorboard_log = None, verbose = 1,
        seed = 0, use_sde = True, sde_sample_freq = -1, rms_prop_eps = 1e-05,
        device = "auto"):
   
  lr_schedule = hyper["params_lr_schedule"]
  learning_rate = hyper["params_lr"]
  if lr_schedule == "linear":
    learning_rate = linear_schedule(learning_rate)

  policy_kwargs = make_policy_kwargs(hyper, "a2c")
  model = A2C(policy, 
              env, 
              tensorboard_log=tensorboard_log, 
              verbose = verbose, 
              seed = seed,
              use_sde = use_sde,
              sde_sample_freq = sde_sample_freq,
              rms_prop_eps = rms_prop_eps,
              learning_rate = learning_rate,
              n_steps = int(hyper["params_n_steps"]),
              gamma = hyper["params_gamma"],
              gae_lambda = hyper["params_gae_lambda"],
              ent_coef = hyper["params_ent_coef"],
              vf_coef = hyper["params_vf_coef"],
              max_grad_norm = hyper["params_max_grad_norm"],
              use_rms_prop = hyper["params_use_rms_prop"],
              normalize_advantage = hyper["params_normalize_advantage"],
              policy_kwargs = policy_kwargs,
              device = device
          )
  return model
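This helper calls two functions that are not shown: make_policy_kwargs, which is project-specific, and linear_schedule. A minimal linear_schedule sketch, following SB3's schedule convention (the schedule is called with the fraction of training remaining, going from 1.0 down to 0.0):

from typing import Callable


def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """Return a schedule that decays the learning rate linearly to zero."""
    def func(progress_remaining: float) -> float:
        # progress_remaining starts at 1.0 and reaches 0.0 at the end of training
        return progress_remaining * initial_value
    return func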
Code example #6
def train(params):

    model = A2C(params.get("policy"),
                multi_env,
                verbose=1,
                tensorboard_log=log_dir,
                learning_rate=params.get("learning_rate"),
                n_steps=params.get("n_steps"),
                gamma=params.get("gamma"),
                gae_lambda=params.get("gae_lambda"),
                ent_coef=params.get("ent_coef"),
                vf_coef=params.get("vf_coef"),
                max_grad_norm=params.get("max_grad_norm"),
                rms_prop_eps=params.get("rms_prop_eps"),
                use_rms_prop=params.get("use_rms_prop"),
                use_sde=params.get("use_sde"),
                sde_sample_freq=params.get("sde_sample_freq"),
                normalize_advantage=params.get("normalize_advantage"),
                policy_kwargs=dict(
                    net_arch=[256, 256, dict(vf=[256], pi=[16])]))

    # Train for 1e5 steps
    model.learn(total_timesteps=params.get("train_steps"), eval_env=env)
    # Save the trained agent
    model.save(exp_name)
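train() above (and evaluate() in code example #25 below) refers to module-level names the snippet does not define: multi_env, env, log_dir and exp_name. A hypothetical setup consistent with how they are used might be:

import gym
from stable_baselines3.common.env_util import make_vec_env

log_dir = "./logs/"                                  # TensorBoard log directory (assumed)
exp_name = "a2c_experiment"                          # save/load name for the model (assumed)
env = gym.make("Pendulum-v1")                        # single env used for evaluation (env id assumed)
multi_env = make_vec_env("Pendulum-v1", n_envs=4)    # vectorized training env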
Code example #7
def test_eval_friendly_error():
    # Check that VecNormalize stats stay synchronized between train and eval envs,
    # and that a friendly error is raised when the eval env is wrapped differently
    train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
    eval_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    eval_env = VecNormalize(eval_env, training=False, norm_reward=False)
    _ = train_env.reset()
    original_obs = train_env.get_original_obs()
    model = A2C("MlpPolicy", train_env, n_steps=50, seed=0)

    eval_callback = EvalCallback(
        eval_env,
        eval_freq=100,
        warn=False,
    )
    model.learn(100, callback=eval_callback)

    # Check synchronization
    assert np.allclose(train_env.normalize_obs(original_obs), eval_env.normalize_obs(original_obs))

    wrong_eval_env = gym.make("CartPole-v1")
    eval_callback = EvalCallback(
        wrong_eval_env,
        eval_freq=100,
        warn=False,
    )

    with pytest.warns(Warning):
        with pytest.raises(AssertionError):
            model.learn(100, callback=eval_callback)
Code example #8
File: test_run.py  Project: mjlbach/stable-baselines3
def test_a2c(env_id):
    model = A2C("MlpPolicy",
                env_id,
                seed=0,
                policy_kwargs=dict(net_arch=[16]),
                verbose=1,
                create_eval_env=True)
    model.learn(total_timesteps=1000, eval_freq=500)
Code example #9
def main():
    #env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    model = A2C('MlpPolicy', env, verbose=1, n_steps=5)
    model.learn(total_timesteps=2500000000)
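The make_env factory used to build the SubprocVecEnv is not shown. One plausible sketch (the project's real helper may differ): it returns a thunk that seeds a per-worker copy of the environment.

def make_env(env, rank, seed=0):
    # SubprocVecEnv pickles this closure with cloudpickle, so each worker
    # process receives its own copy of `env`.
    def _init():
        env.seed(seed + rank)  # old-style gym seeding, matching the rest of the snippet
        return env
    return _init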
Code example #10
def train_A2C(env_train, model_name, timesteps=25000):
    """A2C model"""

    start = time.time()
    model = A2C("MlpPolicy", env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    return model, (end - start) / 60
Code example #11
def dummy_model_distribution_obs_and_actions() -> Tuple[A2C, np.ndarray, np.ndarray]:
    """
    Fixture creating a Pendulum-v1 gym env, an A2C model and sampling 10 random observations and actions from the env
    :return: A2C model, random observations, random actions
    """
    env = gym.make("Pendulum-v1")
    model = A2C("MlpPolicy", env, seed=23)
    random_obs = np.array([env.observation_space.sample() for _ in range(10)])
    random_actions = np.array([env.action_space.sample() for _ in range(10)])
    return model, random_obs, random_actions
Code example #12
 def train(time_steps, save=False, **params):
     env = A2CAgent.create_env(1)
     model = A2C('CnnPolicy',
                 env,
                 verbose=params.get('verbose', 1),
                 tensorboard_log=TB_LOGS,
                 ent_coef=0.01)
     model.learn(total_timesteps=time_steps)
     if save:
         model.save(MODEL_PATH)
Code example #13
def train_A2C(env_train, model_name, timesteps=25000):
    """A2C model"""

    start = time.time()
    model = A2C('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (A2C): ', (end - start) / 60, ' minutes')
    return model
Code example #14
    def __init__(self, env, hyperparameters=DEFAULT_HYPERPARAMETERS):
        self.P = hyperparameters

        if self.P["model_class"] == "dqn":
            from stable_baselines3 import DQN
            self.model = DQN('MlpPolicy', env, verbose=self.P["verbose"])
            self.model_class = DQN

        elif self.P["model_class"] == "a2c":
            from stable_baselines3 import A2C
            from stable_baselines3.a2c import MlpPolicy
            self.model = A2C(MlpPolicy, env, verbose=self.P["verbose"])
            self.model_class = A2C

        elif self.P["model_class"] == "ddpg":
            from stable_baselines3 import DDPG
            from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
            n_actions = env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            self.model = DDPG('MlpPolicy',
                              env,
                              action_noise=action_noise,
                              verbose=self.P["verbose"])
            self.model_class = DDPG

        elif self.P["model_class"] == "td3":
            from stable_baselines3 import TD3
            from stable_baselines3.td3.policies import MlpPolicy
            from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
            n_actions = env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            self.model = TD3(MlpPolicy,
                             env,
                             action_noise=action_noise,
                             verbose=self.P["verbose"])
            self.model_class = TD3

        elif self.P["model_class"] == "ppo":
            from stable_baselines3 import PPO
            from stable_baselines3.ppo import MlpPolicy
            self.model = PPO(MlpPolicy, env, verbose=self.P["verbose"])
            self.model_class = PPO

        elif self.P["model_class"] == "sac":
            from stable_baselines3 import SAC
            from stable_baselines3.sac import MlpPolicy
            self.model = SAC(MlpPolicy, env, verbose=self.P["verbose"])
            self.model_class = SAC

        else:
            raise NotImplementedError()
Code example #15
def getting_started():
    env = gym.make("CartPole-v1")
    model = A2C("MlpPolicy", env, verbose=1)
    #model = A2C("MlpPolicy", "CartPole-v1", verbose=1)

    model.learn(total_timesteps=10000)

    obs = env.reset()
    for i in range(1000):
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            obs = env.reset()
Code example #16
def test_eval_callback_vec_env():
    # tests that the eval callback does not crash when given a vectorized eval env
    n_eval_envs = 3
    train_env = IdentityEnv()
    eval_env = DummyVecEnv([lambda: IdentityEnv()] * n_eval_envs)
    model = A2C("MlpPolicy", train_env, seed=0)

    eval_callback = EvalCallback(
        eval_env,
        eval_freq=100,
        warn=False,
    )
    model.learn(300, callback=eval_callback)
    assert eval_callback.last_mean_reward == 100.0
Code example #17
 def test_multiple_stable_baselines(self):
     env_name = "CartPole-v0"
     env = gym.make(env_name)
     models = [
         DQN("MlpPolicy", gym.make(env_name), learning_rate=1e-3),
         A2C(policy="MlpPolicy", env=gym.make(env_name), verbose=1),
         PPO(policy="MlpPolicy", env=gym.make(env_name), verbose=1),
     ]
     model_names = ["Simple DQN", "A2C", "PPO"]
     train_multiple(models, env, 1470, 195, model_names, 200)
     trained_env = get_saved_environments()[0]
     trained_models = get_trained_model_names(trained_env)
     model_saved = set(model_names) == set(trained_models)
     shutil.rmtree(save_path)
     self.assertTrue(model_saved)
Code example #18
def atari_games_example():
    # There already exists an environment generator that will make and wrap atari environments correctly.
    # Here we are also multi-worker training (n_envs=4 => 4 environments).
    env = make_atari_env("PongNoFrameskip-v4", n_envs=4, seed=0)
    # Frame-stacking with 4 frames.
    env = VecFrameStack(env, n_stack=4)

    model = A2C("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=25_000)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
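With an SB3 1.x install, the Atari helpers used above would be imported roughly as follows (a sketch of the assumed imports):

from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack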
Code example #19
    def train(self, timesteps=10000, continue_training=False):
        start_time = time.time()
        if not continue_training:
            print("Initializing from scratch")
            model = A2C(self.policy_name,
                        self.env,
                        verbose=1,
                        tensorboard_log=self.log_dir)
        else:
            model = self.load_model()
            print("Restored from {}".format(self.model_path))

        model.learn(total_timesteps=timesteps)
        print('\nTraining complete. Time taken = {} secs'.format(time.time() -
                                                                 start_time))
        model.save(self.model_path)
Code example #20
def main():
    #env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    # Create log dir
    log_dir = './a2c_data'
    os.makedirs(log_dir, exist_ok=True)
    env = VecMonitor(env, log_dir)
    callback = custom_call_back.CustomCallback(check_freq=1000,
                                               log_dir=log_dir)

    model = A2C('MlpPolicy', env, verbose=1, n_steps=5)
    model.learn(total_timesteps=2500000000, callback=callback)
Code example #21
File: example.py  Project: benallard/quarto-gym
def a2c(path):
    env = make_env(HumanPlayer())

    eval_env = make_env(RandomPlayer())

    model = A2C.load(path, env, verbose=1)

    mean, std = evaluate_policy(model, eval_env, n_eval_episodes=10)
    print(f"Loaded policy: mean={mean:.2f} +/- {std}")
    # Show how well we learned by playing a game:
    obs = env.reset()
    done = False
    while not done:
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        print(f"{info['turn']: <4} | Reward: {reward: >4} | {info['winner']}")
        env.render()
    print("done")
Code example #22
File: cupsworld_dqn.py  Project: spotter-ai/spotter
def train(env, type, timesteps):
    env.reset()
    print(check_env(env))
    env = FlattenObservation(env)
    print(env.reward_range)
    print(env.action_space)
    if type == "DQN":
        model = DQN('MlpPolicy',
                    exploration_fraction=0.999,
                    env=env,
                    verbose=1)
    elif type == "A2C":
        model = A2C('MlpPolicy', env=env, verbose=1)
    elif type == "PPO":
        model = PPO('MlpPolicy', env=env, verbose=1)

    model.learn(total_timesteps=timesteps)
    model.save("model_cups")
Code example #23
def train_rl_agent(ticker):
    # initialize training structures
    train_data_path = '../data/{}_train.csv'.format(ticker.lower())
    train_data = pd.read_csv(train_data_path)
    env = SingleStockTradingEnv(train_data_path,
                                engineer_features,
                                initial_value=INITIAL_PORTFOLIO_VALUE,
                                borrowing=BORROWING,
                                long_only=LONG_ONLY)

    # create and train agent
    agent = A2C('MlpPolicy', env, gamma=0.1)
    for i in range(10):
        print(ticker, i, env.data.shape[0])
        env.reset()
        # go through the whole history on each training run
        agent.learn(env.data.shape[0])
    agent.save('checkpoints/{}_rl_no_restrictions'.format(ticker.lower()))
Code example #24
    def train_A2C(self, model_name, model_params = A2C_PARAMS):
        """A2C model"""
        from stable_baselines3 import A2C
        env_train = self.env
        start = time.time()
        model = A2C('MlpPolicy', env_train, 
                    n_steps = model_params['n_steps'],
                    ent_coef = model_params['ent_coef'],
                    learning_rate = model_params['learning_rate'],
                    verbose = model_params['verbose'],
                    tensorboard_log = f"{zvt_env['log_path']}/{model_name}"
                    )
        model.learn(total_timesteps=model_params['timesteps'], tb_log_name = "A2C_run")
        end = time.time()

        model.save(f"{zvt_env['model_path']}/{model_name}")
        print('Training time (A2C): ', (end-start)/60,' minutes')
        return model
Code example #25
def evaluate(params):

    # Load saved model
    model = A2C.load(exp_name, env=env)
    results = np.zeros(shape=(0,0))
    obs = env.reset()

    # Evaluate the agent
    episode_reward = 0
    for _ in range(params.get("test_episodes")):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done or info.get('is_success', False):
            episode_reward = 0.0
            obs = env.reset()

        result = ("Reward:", episode_reward, "Success?", info.get('is_success', True))
        results = np.append(results, result, axis=None)
Code example #26
File: model.py  Project: wesleyyuan17/6104-Project
    def __init__(self, model='a2c', use_gp=False, gp_params=None, **kwargs):
        # wrapper around stable_baselines RL implementations
        assert model in ACCEPTED_MODELS, 'Unknown RL model, must be in {}'.format(ACCEPTED_MODELS)
        if model == 'a2c':
            self.rl = A2C(**kwargs)
        elif model == 'ppo':
            self.rl = PPO(**kwargs)
        elif model == 'dqn':
            self.rl = DQN(**kwargs)
        elif model == 'td3':
            self.rl = TD3(**kwargs)

        self.use_gp = use_gp
        if self.use_gp:
            assert gp_params is not None, 'Must provide parameters such as training data, number of iterations, etc. for GPR'
            self.n_train = gp_params['n_train']
            self.retraining_iter = gp_params['training_iter']
            self.cvar_limit = gp_params['cvar_limit']
            self.gp_limit = gp_params['gp_limit']

            self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
            if 'data' in gp_params.keys():
                self.X_train = gp_params['data']['X_train']
                self.y_train = gp_params['data']['y_train']
            else:
                self.X_train = torch.zeros(self.n_train, kwargs['env'].num_features) # hard coded to match dimensions of features
                self.y_train = torch.zeros(self.n_train)
            self.gp = ExactGPModel(self.X_train, self.y_train, self.likelihood)
            self.mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.gp)
            self.opt = torch.optim.Adam(self.gp.parameters(), lr=0.1)

            self.shares = 0
            self.cash = 0
            self.obs = [] # holds up to 2 past observations, helps in keeping X, y aligned

            # for plotting
            self.pred_return = 0
            self.pred_lower = 0
            self.pred_upper = 0

            # for debugging
            self.goal_num_shares = 0
Code example #27
    def train_A2C(self, model_name, model_params = config.A2C_PARAMS):
        """A2C model"""
        from stable_baselines3 import A2C
        from stable_baselines3.a2c import MlpPolicy

        env_train = self.env
        start = time.time()
        model = A2C('MlpPolicy', env_train, 
                    n_steps = model_params['n_steps'],
                    ent_coef = model_params['ent_coef'],
                    learning_rate = model_params['learning_rate'],
                    verbose = model_params['verbose'],
                    tensorboard_log = f"{config.TENSORBOARD_LOG_DIR}/{model_name}"
                    )
        model.learn(total_timesteps=model_params['timesteps'], tb_log_name = "A2C_run")
        end = time.time()

        model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        print('Training time (A2C): ', (end-start)/60,' minutes')
        return model
Code example #28
def test_set_logger(tmp_path):
    # set up logger
    new_logger = configure(str(tmp_path), ["stdout", "csv", "tensorboard"])
    # Default outputs with verbose=0
    model = A2C("MlpPolicy", "CartPole-v1", verbose=0).learn(4)
    assert model.logger.output_formats == []

    model = A2C("MlpPolicy",
                "CartPole-v1",
                verbose=0,
                tensorboard_log=str(tmp_path)).learn(4)
    assert str(tmp_path) in model.logger.dir
    assert isinstance(model.logger.output_formats[0], TensorBoardOutputFormat)

    # Check that env variable work
    new_tmp_path = str(tmp_path / "new_tmp")
    os.environ["SB3_LOGDIR"] = new_tmp_path
    model = A2C("MlpPolicy", "CartPole-v1", verbose=0).learn(4)
    assert model.logger.dir == new_tmp_path

    # Default outputs with verbose=1
    model = A2C("MlpPolicy", "CartPole-v1", verbose=1).learn(4)
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    # with tensorboard
    model = A2C("MlpPolicy",
                "CartPole-v1",
                verbose=1,
                tensorboard_log=str(tmp_path)).learn(4)
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 2
    model.learn(32)
    # set new logger
    model.set_logger(new_logger)
    # Check that the new logger is correctly setup
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], CSVOutputFormat)
    assert isinstance(model.logger.output_formats[2], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 3
    model.learn(32)

    model = A2C("MlpPolicy", "CartPole-v1", verbose=1)
    model.set_logger(new_logger)
    model.learn(32)
    # Check that the new logger is not overwritten
    assert isinstance(model.logger.output_formats[0], HumanOutputFormat)
    assert isinstance(model.logger.output_formats[1], CSVOutputFormat)
    assert isinstance(model.logger.output_formats[2], TensorBoardOutputFormat)
    assert len(model.logger.output_formats) == 3
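test_set_logger uses SB3's logger utilities; assuming SB3 >= 1.1, where configure() lives in the logger module, the imports would be roughly:

import os

from stable_baselines3 import A2C
from stable_baselines3.common.logger import (
    CSVOutputFormat,
    HumanOutputFormat,
    TensorBoardOutputFormat,
    configure,
)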
Code example #29
def test_evaluate_vector_env(n_envs):
    # Tests that the number of episodes evaluated is correct
    n_eval_episodes = 6

    env = make_vec_env("CartPole-v1", n_envs)
    model = A2C("MlpPolicy", "CartPole-v1", seed=0)

    class CountCallback:
        def __init__(self):
            self.count = 0

        def __call__(self, locals_, globals_):
            if locals_["done"]:
                self.count += 1

    count_callback = CountCallback()

    evaluate_policy(model, env, n_eval_episodes, callback=count_callback)

    assert count_callback.count == n_eval_episodes
Code example #30
def make_gif_example():
    # Make a GIF of a Trained Agent.

    import imageio

    model = A2C("MlpPolicy", "LunarLander-v2").learn(100_000)

    images = []
    obs = model.env.reset()
    img = model.env.render(mode="rgb_array")
    for i in range(350):
        images.append(img)
        action, _ = model.predict(obs)
        obs, _, _, _ = model.env.step(action)
        img = model.env.render(mode="rgb_array")

    imageio.mimsave(
        "lander_a2c.gif",
        [np.array(img) for i, img in enumerate(images) if i % 2 == 0],
        fps=29)