Example #1
from stable_baselines import ACER
from stable_baselines.common.policies import CnnPolicy


def train():
    """Train an ACER policy and save it to disk."""
    env = create_env()  # environment factory defined elsewhere in the original project

    model = ACER(policy=CnnPolicy,
                 env=env,
                 gamma=0.99,
                 n_steps=20,
                 num_procs=4,
                 q_coef=0.5,
                 ent_coef=0.01,
                 max_grad_norm=10,
                 learning_rate=0.0007,
                 lr_schedule='linear',
                 rprop_alpha=0.99,
                 rprop_epsilon=1e-05,
                 buffer_size=5000,
                 replay_ratio=4,
                 replay_start=1000,
                 correction_term=10.0,
                 trust_region=True,
                 alpha=0.99,
                 delta=1,
                 verbose=1,
                 tensorboard_log="./tb")

    model.learn(total_timesteps=int(1e7),
                callback=callback,  # training callback; a minimal stand-in is sketched below
                tb_log_name="acer")

    model.save("models/pacman_acer.pkl")
Example #2
import time

from stable_baselines import ACER


def train_ACER(env_train, model_name, timesteps=25000):
    start = time.time()
    model = ACER('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (A2C): ', (end - start) / 60, ' minutes')
    return model
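
A hypothetical call site for this helper, assuming `config.TRAINED_MODEL_DIR` points at an existing directory; `CartPole-v1` is only a stand-in with a discrete action space (ACER requires one), not the environment the original project trains on:

import gym
from stable_baselines.common.vec_env import DummyVecEnv

# Any discrete-action Gym env wrapped in a DummyVecEnv works for a quick smoke test
env_train = DummyVecEnv([lambda: gym.make("CartPole-v1")])
acer_model = train_ACER(env_train, model_name="acer_cartpole", timesteps=25000)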
Example #3
import stable_baselines
from stable_baselines import ACER
from stable_baselines.common.vec_env import DummyVecEnv


def train_acer(timesteps, name):
    env = datares_roulette  # custom roulette env constructor defined elsewhere
    env = DummyVecEnv([env])
    model = ACER(
        stable_baselines.common.policies.MlpPolicy,
        env,
        verbose=1,
    )
    model.learn(total_timesteps=timesteps)
    model.save(name)
    return model
Example #4
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines import ACER

# There already exists an environment generator
# that will make and wrap atari environments correctly.
# Here we are also multiprocessing training (num_env=4 => 4 processes)
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = ACER('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=25000)

# save
model.save("cnn_pong")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
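
The render loop above never returns. As a separate, hedged sketch, the saved snapshot can be reloaded later and scored over a fixed number of episodes on the same stacked env, tracking only the first of the four vectorized environments:

from stable_baselines import ACER

model = ACER.load("cnn_pong")
episode_rewards, total = [], 0.0
obs = env.reset()
while len(episode_rewards) < 10:
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
    total += rewards[0]
    if dones[0]:  # the VecEnv auto-resets, so just log the episode and keep stepping
        episode_rewards.append(total)
        total = 0.0
print("mean reward over 10 episodes:", sum(episode_rewards) / len(episode_rewards))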
Example #5
class ACERAgent(Agent):
    def __init__(
        self,
        model_name="model_name",
        save_dir="./models",
        log_interval=int(1e4),
        num_cpus=8,
        eval_episodes=1000,
        n_steps=int(1e6),
        layer_normalization=False,
        model_kwargs={"tensorboard_log": "./tensorboards/"},
        env_kwargs={
            "board_size": 4,
            "binary": True,
            "extractor": "cnn"
        },
        callback_checkpoint_kwargs={
            "save_freq": 0,
            "save_path": "./models/",
            "name_prefix": "model_name"
        },
        callback_hist_kwargs={"hist_freq": 0},
    ):
        super().__init__(
            model_name,
            save_dir,
            num_cpus,
            model_kwargs,
            env_kwargs,
            layer_normalization,
            callback_checkpoint_kwargs,
            callback_hist_kwargs,
            n_steps,
            log_interval,
            eval_episodes,
        )
        self._init_model()

    def _init_model(self):
        if not self._model_kwargs["agent"].lower() == "acer":
            raise ValueError(
                "The model_kwargs dict has to be created using the ACER agent's "
                "arguments as reference. Make sure the correct parameters are "
                "passed for this model."
            )

        del self._model_kwargs["agent"]

        self._callback_checkpoint_kwargs["save_freq"] = int(
            self._callback_checkpoint_kwargs["save_freq"] / self._num_cpus)

        if self._env_kwargs["extractor"] == "mlp":
            self._model = ACER(CustomMlpPolicy, self._env,
                               **self._model_kwargs)
        else:
            self._model = ACER(CustomCnnPolicy, self._env,
                               **self._model_kwargs)

    def train(self):
        "Optimize the model."
        callbacks = []

        # Checkpoint callback
        if self._callback_checkpoint_kwargs["save_freq"] > 0:

            # Append model name into checkpoint save_path
            self._callback_checkpoint_kwargs["save_path"] = (
                self._callback_checkpoint_kwargs["save_path"] + "/" +
                str(self._model_name))
            checkpoint_callback = CheckpointCallback(
                **self._callback_checkpoint_kwargs)
            callbacks.append(checkpoint_callback)

        if self._callback_hist_kwargs["hist_freq"] > 0:
            # hist_callback = CustomCallbackPPO2(**self._callback_hist_kwargs)
            # callbacks.append(hist_callback)
            pass

        try:
            self._model.learn(self._n_steps,
                              log_interval=self._log_interval,
                              callback=callbacks,
                              tb_log_name=self._model_name)
        except KeyboardInterrupt:
            pass

        folder_path = os.path.join(self._save_dir, self._model_name)
        self._model.save(os.path.join(folder_path, self._model_name))

    def test(self):
        "Evaluate the model."

        mean_reward = super()._test(self._model)
        return mean_reward
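
A hypothetical way to drive this class, assuming the surrounding project supplies the `Agent` base class, the custom policies and the board environment referenced by `env_kwargs`; note that `model_kwargs` must carry an `"agent": "acer"` entry for `_init_model()` to accept it:

# Sketch only; parameter values are illustrative.
agent = ACERAgent(
    model_name="acer_cnn_board4",
    save_dir="./models",
    num_cpus=8,
    n_steps=int(1e6),
    model_kwargs={"agent": "acer", "tensorboard_log": "./tensorboards/"},
    env_kwargs={"board_size": 4, "binary": True, "extractor": "cnn"},
    callback_checkpoint_kwargs={"save_freq": 50000,
                                "save_path": "./models/",
                                "name_prefix": "acer_cnn_board4"},
)
agent.train()
print("mean evaluation reward:", agent.test())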
Example #6
import gym
from stable_baselines import ACER
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv

# trying to get an idea of how quickly my computer can train this
pong_env = gym.make('Pong-v0')
pong_env = DummyVecEnv([lambda: pong_env])
pong_model_acer = ACER(
    CnnPolicy,
    pong_env,
    verbose=0,
    tensorboard_log="./../../data/baselines-stuff/pong/acer_pong_tensorboard/")
pong_model_acer.learn(total_timesteps=50_000_000,
                      tb_log_name="run-1-50_000_000")

# since I know I'll be stopping it early
pong_model_acer.save(
    './../../data/baselines-stuff/pong/terrible_pong_model_acer')
Example #7
from stable_baselines import ACER
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.policies import MlpLnLstmPolicy
from stable_baselines.common.callbacks import (CallbackList, CheckpointCallback,
                                               EvalCallback,
                                               StopTrainingOnRewardThreshold)

# RPiLEDEnv and envArgsDict are the project's custom environment and its kwargs
env = make_vec_env(RPiLEDEnv, env_kwargs=envArgsDict)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-20,
                                                 verbose=1)

eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=5000,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)

# Added a checkpoint callback because I lost model data after a crash
# when the webcam shut down because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='ppo1_model')

cb = CallbackList([checkpoint_callback, eval_callback])

policy_kwargs = {'layers': [128]}

model = ACER(MlpLnLstmPolicy,
             env,
             verbose=1,
             policy_kwargs=policy_kwargs,
             tensorboard_log='./logs/')
model.learn(total_timesteps=10000, callback=cb)
model.save('acer_rpi_lid')
print('model saved')
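
As a hedged follow-up, the best checkpoint written by `EvalCallback` (saved as `best_model` inside `./logs/best`, extension handled by the loader) could be reloaded and scored; `evaluate_policy` ships with stable-baselines >= 2.10:

from stable_baselines import ACER
from stable_baselines.common.evaluation import evaluate_policy

best_model = ACER.load('./logs/best/best_model')
mean_reward, std_reward = evaluate_policy(best_model, env, n_eval_episodes=10)
print('best model reward: {:.2f} +/- {:.2f}'.format(mean_reward, std_reward))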
    env_id = "/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux"
    #env = UnityEnv(env_id, worker_id=2, use_visual=False)
    # Create log dir
    time_int = int(time.time())
    log_dir = "stable_results/basic_env_{}/".format(time_int)
    os.makedirs(log_dir, exist_ok=True)

    #env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    num_env = 2
    worker_id = 9
    env = SubprocVecEnv(
        [make_env(env_id, log_dir, i + worker_id) for i in range(num_env)])

    model = ACER(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=30000)
    model.save(log_dir + "model")

    #evaluate agent
    episodes = 100
    ep_r = []
    ep_l = []
    for e in range(episodes):
        obs = env.reset()
        total_r = 0.
        total_l = 0.
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, infos = env.step(action)
            total_l += 1.
            total_r += rewards[0]
            if dones[0]:
                ep_r.append(total_r)
                ep_l.append(total_l)
                break
Example #9
from pathlib import Path
from freqtrade.configuration import Configuration

config = Configuration.from_files(['config_rl.json'])

from freqtradegym import TradingEnv
from stable_baselines.common.policies import MlpPolicy

from stable_baselines import ACER

if __name__ == "__main__":

    env = TradingEnv(config)
    policy_kwargs = dict(layers=[32, 32])
    model = ACER(MlpPolicy,
                 env,
                 learning_rate=1e-4,
                 policy_kwargs=policy_kwargs,
                 verbose=0,
                 tensorboard_log="./tensorboard/")

    model.learn(total_timesteps=int(1e+6))
    model.save('model')
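
    # Hedged sketch: reuse the saved agent on the same TradingEnv instance afterwards.
    # Whether this constitutes a meaningful backtest depends on how freqtradegym
    # prepares its data; 'model' is the save path used just above.
    model = ACER.load('model')
    obs = env.reset()
    done, total_reward = False, 0.0
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        total_reward += reward
    print('episode reward:', total_reward)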