Example 1
def run(learning_steps=4300,
        verbose=0,
        n_steps=20,
        gamma=0.99,
        learning_rate=7e-4,
        ent_coef=0.01,
        tensorboard_log="tensorboard"):
    global inner_env
    inner_env = gym.make(
        'gym_threshold:extended-state-semi-fixed-end-not-adapted-v0')
    env = DummyVecEnv([lambda: inner_env])

    model = ACER(MlpPolicy,
                 env,
                 verbose=verbose,
                 n_steps=n_steps,
                 gamma=gamma,
                 ent_coef=ent_coef,
                 learning_rate=learning_rate,
                 tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=learning_steps,
                # splitext drops the ".py" suffix; rstrip(".py") would strip any
                # trailing '.', 'p' or 'y' characters instead of the extension
                tb_log_name=os.path.splitext(os.path.basename(__file__))[0],
                callback=tensorboard_callback)

    env.close()
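The run() function above passes a tensorboard_callback that is defined elsewhere in the original module. As a point of reference, a legacy-style Stable Baselines 2 callback is simply a callable that receives the local and global namespaces and returns a bool; the sketch below is an assumption about its shape, not the original code.

def tensorboard_callback(locals_, globals_):
    """Minimal sketch of a legacy SB2 callback (hypothetical body)."""
    model = locals_['self']              # the ACER instance being trained
    if model.num_timesteps % 1000 == 0:
        print("timesteps so far:", model.num_timesteps)
    return True                          # returning False would stop training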
Example 2
def train():
    """Trains an ACER policy """
    env = create_env()

    model = ACER(policy=CnnPolicy,
                 env=env,
                 gamma=0.99,
                 n_steps=20,
                 num_procs=4,
                 q_coef=0.5,
                 ent_coef=0.01,
                 max_grad_norm=10,
                 learning_rate=0.0007,
                 lr_schedule='linear',
                 rprop_alpha=0.99,
                 rprop_epsilon=1e-05,
                 buffer_size=5000,
                 replay_ratio=4,
                 replay_start=1000,
                 correction_term=10.0,
                 trust_region=True,
                 alpha=0.99,
                 delta=1,
                 verbose=1,
                 tensorboard_log="./tb")

    model.learn(total_timesteps=int(1e7),
                callback=callback,
                tb_log_name="acer")

    model.save("models/pacman_acer.pkl")
Example 3
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu):
    """
    Train an ACER model on Atari.

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_cpu: (int) The number of CPUs to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    else:
        warnings.warn("Policy {} not implemented".format(policy))
        return

    model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
    # Free memory
    del model
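A hedged usage sketch for train(); the environment ID, CPU count, and schedule below are placeholders chosen from the options listed in the docstring.

train(env_id='BreakoutNoFrameskip-v4',
      num_timesteps=int(1e6),
      seed=0,
      policy='cnn',
      lr_schedule='constant',
      num_cpu=4)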
Example 4
def train_ACER(env_train, model_name, timesteps=25000):
    start = time.time()
    model = ACER('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (ACER): ', (end - start) / 60, ' minutes')
    return model
Example 5
def train_acer(timesteps, name):
    env = datares_roulette
    env = DummyVecEnv([env])
    model = ACER(
        stable_baselines.common.policies.MlpPolicy,
        env,
        verbose=1,
    )
    model.learn(total_timesteps=timesteps)
    model.save(name)
    return model
Example 6
def acer(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = ACER(MlpPolicy, env, verbose=0)
    # Train the agent
    print("Beginning training episodes with ACER.")
    model.learn(total_timesteps=timesteps)

    env.close()
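Because the environment is wrapped in a Monitor, episode statistics end up in log_dir. A small sketch (not part of the original) of reading them back with the Stable Baselines helpers:

import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

def report_mean_reward(log_dir):
    """Print the mean reward over the last 100 monitored episodes."""
    x, y = ts2xy(load_results(log_dir), 'timesteps')
    if len(y) > 0:
        print("episodes:", len(y), "mean reward:", np.mean(y[-100:]))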
Example 7
def train_acer(seed):
    """
    test ACER on the uav_env(cartesian,discrete)
    :param seed: random seed
    :return: evaluation
    """
    """
    ACER(policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5, ent_coef=0.01,
    max_grad_norm=10, learning_rate=0.0007, lr_schedule='linear', rprop_alpha=0.99,
    rprop_epsilon=1e-05, buffer_size=5000, replay_ratio=4, replay_start=1000, 
    correction_term=10.0, trust_region=True, alpha=0.99, delta=1, verbose=0, 
    tensorboard_log=None, _init_setup_model=True)
    """
    algo = 'ACER'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = ACER(policy=MlpPolicy, env=env, gamma=0.99, n_steps=20, num_procs=1,
                 q_coef=0.5, ent_coef=0.01, max_grad_norm=10, learning_rate=0.0007,
                 lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-05,
                 buffer_size=5000, replay_ratio=4, replay_start=1000,
                 correction_term=10.0, trust_region=True, alpha=0.99, delta=1,
                 verbose=0, tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = ACER.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv', "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
Example 8
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines import ACER

# There already exists an environment generator
# that will make and wrap atari environments correctly.
# Here we are also multiprocessing training (num_env=4 => 4 processes)
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = ACER('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=25000)

# save
model.save("cnn_pong")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
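If the script is restarted later, the agent saved as "cnn_pong" can be reloaded instead of retrained; a minimal sketch under that assumption, reusing the imports from the example above:

env = VecFrameStack(make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0), n_stack=4)
model = ACER.load("cnn_pong", env=env)
# continue training without resetting the timestep counter
model.learn(total_timesteps=25000, reset_num_timesteps=False)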
Example 9
class ACERAgent(Agent):
    def __init__(
        self,
        model_name="model_name",
        save_dir="./models",
        log_interval=1e4,
        num_cpus=8,
        eval_episodes=1000,
        n_steps=1e6,
        layer_normalization=False,
        model_kwargs={"tensorboard_log": "./tensorboards/"},
        env_kwargs={
            "board_size": 4,
            "binary": True,
            "extractor": "cnn"
        },
        callback_checkpoint_kwargs={
            "save_freq": 0,
            "save_path": "./models/",
            "name_prefix": "model_name"
        },
        callback_hist_kwargs={"hist_freq": 0},
    ):
        super().__init__(
            model_name,
            save_dir,
            num_cpus,
            model_kwargs,
            env_kwargs,
            layer_normalization,
            callback_checkpoint_kwargs,
            callback_hist_kwargs,
            n_steps,
            log_interval,
            eval_episodes,
        )
        self._init_model()

    def _init_model(self):
        if not self._model_kwargs["agent"].lower() == "acer":
            raise ValueError(
                "The model_kwargs dict has to be created using args from  ACER agent as reference. Make sure the correct parameters models."
            )

        del self._model_kwargs["agent"]

        self._callback_checkpoint_kwargs["save_freq"] = int(
            self._callback_checkpoint_kwargs["save_freq"] / self._num_cpus)

        if self._env_kwargs["extractor"] == "mlp":
            self._model = ACER(CustomMlpPolicy, self._env,
                               **self._model_kwargs)
        else:
            self._model = ACER(CustomCnnPolicy, self._env,
                               **self._model_kwargs)

    def train(self):
        "Optimize the model."
        callbacks = []

        # Checkpoint callback
        if self._callback_checkpoint_kwargs["save_freq"] > 0:

            # Append model name into checkpoint save_path
            self._callback_checkpoint_kwargs["save_path"] = (
                self._callback_checkpoint_kwargs["save_path"] + "/" +
                str(self._model_name))
            checkpoint_callback = CheckpointCallback(
                **self._callback_checkpoint_kwargs)
            callbacks.append(checkpoint_callback)

        if self._callback_hist_kwargs["hist_freq"] > 0:
            # hist_callback = CustomCallbackPPO2(**self._callback_hist_kwargs)
            # callbacks.append(hist_callback)
            pass

        try:
            self._model.learn(self._n_steps,
                              log_interval=self._log_interval,
                              callback=callbacks,
                              tb_log_name=self._model_name)
        except KeyboardInterrupt:
            pass

        folder_path = os.path.join(self._save_dir, self._model_name)
        self._model.save(os.path.join(folder_path, self._model_name))

    def test(self):
        "Evaluate the model."

        mean_reward = super()._test(self._model)
        return mean_reward
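A hedged sketch of how the class above might be driven; every literal here is a placeholder, and the "agent" key is included because _init_model() pops it from model_kwargs.

agent = ACERAgent(
    model_name="acer_2048",
    model_kwargs={"agent": "acer", "tensorboard_log": "./tensorboards/"},
    env_kwargs={"board_size": 4, "binary": True, "extractor": "cnn"},
)
agent.train()
print("mean reward:", agent.test())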
Example 10
import gym
from stable_baselines import ACER
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv

# trying to get an idea of how quickly my computer can train this
pong_env = gym.make('Pong-v0')
pong_env = DummyVecEnv([lambda: pong_env])
pong_model_acer = ACER(
    CnnPolicy,
    pong_env,
    verbose=0,
    tensorboard_log="./../../data/baselines-stuff/pong/acer_pong_tensorboard/")
pong_model_acer.learn(total_timesteps=50_000_000,
                      tb_log_name="run-1-50_000_000")

# since I know I'll be stopping it early
pong_model_acer.save(
    './../../data/baselines-stuff/pong/terrible_pong_model_acer')
Example 11
import gym
from time import time, ctime

from timeit import default_timer as timer
from datetime import timedelta

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import ACER

# create the custom environment
env = gym.make('RX_env:RX-v1')

model = ACER(MlpPolicy, env, verbose=1, tensorboard_log="acer_log")

stt = timer()
model.learn(total_timesteps=100000, tb_log_name="first_x_acer")
#model.learn(total_timesteps=1000000, tb_log_name="second_x_a2c", reset_num_timesteps=False)
end = timer()

#model.save("acer_x")
#del model # remove to demonstrate saving and loading
#model = A2C.load("acer_x")

obs = env.reset("f03.jss")
reward = 0
step = 0
while True:
    step += 1
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
Example 12
        if mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            print("Saving new best model")
            _locals['self'].save(model_directory + 'acer-model_' +
                                 str(n_steps + 1) + '.pkl')
    n_steps += 1
    return True


if __name__ == "__main__":
    os.makedirs(log_directory, exist_ok=True)
    os.makedirs(model_directory, exist_ok=True)

    env = SubprocVecEnv([
        lambda: Monitor(gym.make('gym_building:building-v0',
                                 people=people,
                                 num_of_lift=3,
                                 height_of_building=5),
                        log_directory,
                        allow_early_resets=True) for i in range(4)
    ])

    model = ACER(env=env,
                 policy=MlpLnLstmPolicy,
                 verbose=1,
                 tensorboard_log="./acer_tensorboard/",
                 learning_rate=0.01,
                 lr_schedule='double_linear_con')

    model.learn(total_timesteps=TIMESTEPS, callback=callback)
Example 13
def test_action_mask_learn_acer(vec_env, policy, env_class):
    env = vec_env([env_class]*2)

    model = ACER(policy, env, verbose=0)
    model.learn(total_timesteps=500)
    env.close()
Example 14
def main(argv):
    environmentName = ''
    algorithmName = ''

    # Parse arguments
    try:
        opts, args = getopt.getopt(argv, 'e:a:', ['env=', 'alg='])
    except getopt.GetoptError:
        print('--env <environment-name> --alg <algorithm-name>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-e', '--env'):
            environmentName = arg
        elif opt in ('-a', '--alg'):
            algorithmName = arg

    # Create environment
    env = gym.make(environmentName)

    # Create model
    if algorithmName == 'A2C':
        model = A2C('MlpPolicy',
                    env,
                    verbose=1,
                    tensorboard_log=getTensorboardLogLocation(
                        environmentName, algorithmName),
                    full_tensorboard_log=False)
    elif algorithmName == 'ACER':
        model = ACER('MlpPolicy',
                     env,
                     verbose=1,
                     tensorboard_log=getTensorboardLogLocation(
                         environmentName, algorithmName),
                     full_tensorboard_log=False)
    elif algorithmName == 'ACKTR':
        model = ACKTR('MlpPolicy',
                      env,
                      verbose=1,
                      tensorboard_log=getTensorboardLogLocation(
                          environmentName, algorithmName),
                      full_tensorboard_log=False)
    elif algorithmName == 'DDPG':
        model = DDPG('MlpPolicy',
                     env,
                     verbose=1,
                     tensorboard_log=getTensorboardLogLocation(
                         environmentName, algorithmName),
                     full_tensorboard_log=False)
    elif algorithmName == 'DQN':
        model = DQN('MlpPolicy',
                    env,
                    verbose=1,
                    tensorboard_log=getTensorboardLogLocation(
                        environmentName, algorithmName),
                    full_tensorboard_log=False)
    elif algorithmName == 'PPO':
        model = PPO2('MlpPolicy',
                     env,
                     verbose=1,
                     tensorboard_log=getTensorboardLogLocation(
                         environmentName, algorithmName),
                     full_tensorboard_log=False)
    elif algorithmName == 'SAC':
        model = SAC('MlpPolicy',
                    env,
                    verbose=1,
                    tensorboard_log=getTensorboardLogLocation(
                        environmentName, algorithmName),
                    full_tensorboard_log=False)
    elif algorithmName == 'TD3':
        model = TD3('MlpPolicy',
                    env,
                    verbose=1,
                    tensorboard_log=getTensorboardLogLocation(
                        environmentName, algorithmName),
                    full_tensorboard_log=False)
    elif algorithmName == 'TRPO':
        model = TRPO('MlpPolicy',
                     env,
                     verbose=1,
                     tensorboard_log=getTensorboardLogLocation(
                         environmentName, algorithmName),
                     full_tensorboard_log=False)
    else:
        print('Unknown algorithm: ' + algorithmName)
        sys.exit(2)

    model.learn(total_timesteps=int(STEPS), log_interval=250)

    print('Trained algorithm:')
    print(environmentName, algorithmName)
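The if/elif chain above repeats the same constructor call for every algorithm. A sketch of an equivalent table-driven version, assuming the same imports and helpers as the example:

ALGORITHMS = {
    'A2C': A2C, 'ACER': ACER, 'ACKTR': ACKTR, 'DDPG': DDPG,
    'DQN': DQN, 'PPO': PPO2, 'SAC': SAC, 'TD3': TD3, 'TRPO': TRPO,
}

def create_model(algorithmName, environmentName, env):
    """Look the algorithm up by name and build it with the shared settings."""
    if algorithmName not in ALGORITHMS:
        print('Unknown algorithm: ' + algorithmName)
        sys.exit(2)
    return ALGORITHMS[algorithmName](
        'MlpPolicy',
        env,
        verbose=1,
        tensorboard_log=getTensorboardLogLocation(environmentName, algorithmName),
        full_tensorboard_log=False)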
Example 15
env = make_vec_env(RPiLEDEnv, env_kwargs=envArgsDict)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-20,
                                                 verbose=1)

eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=5000,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)

# Added a checkpoint callback because I lost model data after a crash when the
# webcam shut down because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='acer_model')

cb = CallbackList([checkpoint_callback, eval_callback])

policy_kwargs = {'layers': [128]}

model = ACER(MlpLnLstmPolicy,
             env,
             verbose=1,
             policy_kwargs=policy_kwargs,
             tensorboard_log='./logs/')
model.learn(total_timesteps=10000, callback=cb)
model.save('acer_rpi_lid')
print('model saved')
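Since the EvalCallback above writes its best checkpoint to ./logs/best/best_model.zip, the strongest agent seen during training can be reloaded afterwards; a short sketch:

best_model = ACER.load('./logs/best/best_model', env=env)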
Example 16
    num_cpu = 15  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(x,y,z, i) for i in range(num_cpu)])
    eval_env=environment(x,y,z,gamma)
    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)
    scenario = f'{inputfile_s}_t{test}_lr{LR_s}_gamma{gamma_s}_batch{batch_size}'
    callbacklist = CallbackList([
        TimeLimit(episodetimesteps),
        EvalCallback(eval_env, log_path=scenario, n_eval_episodes=20,
                     deterministic=False, best_model_save_path=scenario)
    ])

    model = ACER(MlpPolicy, env, gamma=gamma, n_steps=batch_size,
                 learning_rate=LR, verbose=1,
                 lr_schedule='constant')  # , tensorboard_log=scenario
    model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)

    filename = './%s/evaluations.npz' % scenario

    data = np.load(filename)
    results = data['results']
    y = np.average(results, axis=1)
    timesteps = data['timesteps']
    plt.plot(timesteps, y)

    plt.xlabel('Timesteps')
    plt.ylabel('Score')
    # plt.show()

    savepath = './%s/fig_%s' % (scenario, scenario)
Example 17
def test_env(modelpath, modelname):
    for name in modelpath:
        os.makedirs(name, exist_ok=True)
        env = IdentityEnv(18, 18, 60)
        env = Monitor(env, name)
        e = DummyVecEnv([lambda: env])
        if name == log_dir_a2c:
            model = A2C(policy="MlpPolicy", env=e, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                        log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps,
                                         results_plotter.X_EPISODES,
                                         "a2c Monitor")
            plt.show()
        if name == log_dir_acer:
            model = ACER(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                        log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps,
                                         results_plotter.X_EPISODES,
                                         "acer Monitor")
            plt.show()
        if name == log_dir_acktr:
            model = ACKTR(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                        log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps,
                                         results_plotter.X_EPISODES,
                                         "ACKTR Monitor")
            plt.show()
        if name == log_dir_dqn:
            model = DQN(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                        log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps,
                                         results_plotter.X_EPISODES,
                                         "DQN Monitor")
            plt.show()
        if name == log_dir_ppo1:
            model = PPO1(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                        log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps,
                                         results_plotter.X_EPISODES,
                                         "PPO1 Monitor")
            plt.show()
        if name == log_dir_poo2:
            model = PPO2(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                        log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps,
                                         results_plotter.X_EPISODES,
                                         "PPO2 Monitor")
            plt.show()
        if name == log_dir_trpo:
            model = TRPO(policy="MlpPolicy", env=env, verbose=0)
            callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                        log_dir=name)
            time_steps = 1e5
            model.learn(total_timesteps=int(time_steps), callback=callback)
            results_plotter.plot_results([name], time_steps,
                                         results_plotter.X_EPISODES,
                                         "TRPO Monitor")
            plt.show()
Example 18
    if (os.path.exists("%s/final_model.zip" % savepath)):
        # Instantiate the agent
        model = ACER(policy,
                     env,
                     gamma=gamma,
                     n_steps=episodetimesteps,
                     learning_rate=LR,
                     buffer_size=5000,
                     verbose=1,
                     n_cpu_tf_sess=num_cpu)
        # Load the trained agent
        model = ACER.load("%s/final_model" % savepath, env=env)
        print('loaded agent')
        save_evals()
        model.learn(
            total_timesteps=episodetimesteps**50, callback=callbacklist
        )  #total timesteps set to very large number so program will terminate based on runtime parameter)

    else:
        #create model with Stable Baselines package.
        model = ACER(policy,
                     env,
                     gamma=gamma,
                     n_steps=episodetimesteps,
                     learning_rate=LR,
                     buffer_size=5000,
                     verbose=1,
                     n_cpu_tf_sess=num_cpu)  #, tensorboard_log=scenario)
        #model = ACER.load("%s/best_model" % savepath, env)
        save_evals()
        model.learn(
Example 19
from pathlib import Path
from freqtrade.configuration import Configuration

config = Configuration.from_files(['config_rl.json'])

from freqtradegym import TradingEnv
from stable_baselines.common.policies import MlpPolicy

from stable_baselines import ACER

if __name__ == "__main__":

    env = TradingEnv(config)
    policy_kwargs = dict(layers=[32, 32])
    model = ACER(MlpPolicy,
                 env,
                 learning_rate=1e-4,
                 policy_kwargs=policy_kwargs,
                 verbose=0,
                 tensorboard_log="./tensorboard/")

    model.learn(total_timesteps=int(1e+6))
    model.save('model')
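A hedged continuation of the __main__ block above: reload the checkpoint written by model.save('model') and query it against the same TradingEnv. The step loop is an assumption about how the environment is consumed.

    model = ACER.load('model')
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()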