def run(learning_steps=4300, verbose=0, n_steps=20, gamma=0.99,
        learning_rate=7e-4, ent_coef=0.01, tensorboard_log="tensorboard"):
    global inner_env
    inner_env = gym.make(
        'gym_threshold:extended-state-semi-fixed-end-not-adapted-v0')
    env = DummyVecEnv([lambda: inner_env])

    model = ACER(MlpPolicy, env, verbose=verbose, n_steps=n_steps, gamma=gamma,
                 ent_coef=ent_coef, learning_rate=learning_rate,
                 tensorboard_log=tensorboard_log)
    # os.path.splitext() drops the ".py" extension; the original
    # rstrip(".py") would strip any trailing '.', 'p' or 'y' characters,
    # not the suffix.
    model.learn(total_timesteps=learning_steps,
                tb_log_name=os.path.splitext(os.path.basename(__file__))[0],
                callback=tensorboard_callback)
    env.close()
def train():
    """Train an ACER policy."""
    env = create_env()
    model = ACER(policy=CnnPolicy, env=env, gamma=0.99, n_steps=20, num_procs=4,
                 q_coef=0.5, ent_coef=0.01, max_grad_norm=10, learning_rate=0.0007,
                 lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-05,
                 buffer_size=5000, replay_ratio=4, replay_start=1000,
                 correction_term=10.0, trust_region=True, alpha=0.99, delta=1,
                 verbose=1, tensorboard_log="./tb")
    model.learn(total_timesteps=int(1e7), callback=callback, tb_log_name="acer")
    model.save("models/pacman_acer.pkl")
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu):
    """
    train an ACER model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    else:
        warnings.warn("Policy {} not implemented".format(policy))
        return

    model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
    # Free memory
    del model
def train_ACER(env_train, model_name, timesteps=25000):
    start = time.time()
    model = ACER('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    # The original message said "A2C", but this helper trains ACER.
    print('Training time (ACER): ', (end - start) / 60, ' minutes')
    return model
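# A minimal usage sketch for the helper above (the model name and step count
# are hypothetical), assuming env_train is the same environment the model
# was trained on:
model = train_ACER(env_train, "acer_trading", timesteps=25000)

obs = env_train.reset()
for _ in range(100):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env_train.step(action)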
def train_acer(timesteps, name):
    env = datares_roulette
    env = DummyVecEnv([env])
    model = ACER(
        stable_baselines.common.policies.MlpPolicy,
        env,
        verbose=1,
    )
    model.learn(total_timesteps=timesteps)
    model.save(name)
    return model
def acer(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = ACER(MlpPolicy, env, verbose=0)
    # Train the agent
    print("Beginning training episodes with ACER.")
    model.learn(total_timesteps=timesteps)
    env.close()
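# Optional follow-up (sketch): because the env above is wrapped in Monitor,
# episode rewards are logged to log_dir and can be plotted afterwards with
# the bundled helper, reusing the same log_dir/timesteps passed to acer():
import matplotlib.pyplot as plt
from stable_baselines import results_plotter

results_plotter.plot_results([log_dir], timesteps,
                             results_plotter.X_TIMESTEPS, "ACER")
plt.show()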
def train_acer(seed):
    """
    test ACER on the uav_env (cartesian, discrete)

    :param seed: random seed
    :return: evaluation

    Reference signature:
    ACER(policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5,
         ent_coef=0.01, max_grad_norm=10, learning_rate=0.0007,
         lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-05,
         buffer_size=5000, replay_ratio=4, replay_start=1000,
         correction_term=10.0, trust_region=True, alpha=0.99, delta=1,
         verbose=0, tensorboard_log=None, _init_setup_model=True)
    """
    algo = 'ACER'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = ACER(policy=MlpPolicy, env=env, gamma=0.99, n_steps=20, num_procs=1,
                 q_coef=0.5, ent_coef=0.01, max_grad_norm=10, learning_rate=0.0007,
                 lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-05,
                 buffer_size=5000, replay_ratio=4, replay_start=1000,
                 correction_term=10.0, trust_region=True, alpha=0.99, delta=1,
                 verbose=0,
                 tensorboard_log="./logs/{}/tensorboard/{}/".format(
                     EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = ACER.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
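# evaluate_model() is not defined in this snippet. A hypothetical stand-in,
# assuming set_up_env() returns a plain (non-vectorized) gym env: roll out
# `episodes` episodes and return the mean episode reward.
def evaluate_model(env, model, episodes):
    returns = []
    for _ in range(episodes):
        obs, done, total = env.reset(), False, 0.0
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, _info = env.step(action)
            total += reward
        returns.append(total)
    return sum(returns) / len(returns)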
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines import ACER

# There already exists an environment generator
# that will make and wrap atari environments correctly.
# Here we are also multiprocessing training (num_env=4 => 4 processes)
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = ACER('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=25000)
# save
model.save("cnn_pong")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
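# A follow-up sketch: reload the saved model and evaluate it. evaluate_policy
# ships with stable-baselines >= 2.10; "cnn_pong" is the file saved above.
from stable_baselines.common.evaluation import evaluate_policy

model = ACER.load("cnn_pong", env=env)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print("mean reward: {:.1f} +/- {:.1f}".format(mean_reward, std_reward))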
class ACERAgent(Agent):
    def __init__(
        self,
        model_name="model_name",
        save_dir="./models",
        log_interval=1e4,
        num_cpus=8,
        eval_episodes=1000,
        n_steps=1e6,
        layer_normalization=False,
        model_kwargs={"tensorboard_log": "./tensorboards/"},
        env_kwargs={
            "board_size": 4,
            "binary": True,
            "extractor": "cnn"
        },
        callback_checkpoint_kwargs={
            "save_freq": 0,
            "save_path": "./models/",
            "name_prefix": "model_name"
        },
        callback_hist_kwargs={"hist_freq": 0},
    ):
        super().__init__(
            model_name,
            save_dir,
            num_cpus,
            model_kwargs,
            env_kwargs,
            layer_normalization,
            callback_checkpoint_kwargs,
            callback_hist_kwargs,
            n_steps,
            log_interval,
            eval_episodes,
        )
        self._init_model()

    def _init_model(self):
        if not self._model_kwargs["agent"].lower() == "acer":
            raise ValueError(
                "The model_kwargs dict has to be created using args from the "
                "ACER agent as reference. Make sure the parameters match the model.")
        del self._model_kwargs["agent"]

        self._callback_checkpoint_kwargs["save_freq"] = int(
            self._callback_checkpoint_kwargs["save_freq"] / self._num_cpus)

        if self._env_kwargs["extractor"] == "mlp":
            self._model = ACER(CustomMlpPolicy, self._env, **self._model_kwargs)
        else:
            self._model = ACER(CustomCnnPolicy, self._env, **self._model_kwargs)

    def train(self):
        "Optimize the model."
        callbacks = []
        # Checkpoint callback
        if self._callback_checkpoint_kwargs["save_freq"] > 0:
            # Append model name into checkpoint save_path
            self._callback_checkpoint_kwargs["save_path"] = (
                self._callback_checkpoint_kwargs["save_path"] + "/" +
                str(self._model_name))
            checkpoint_callback = CheckpointCallback(
                **self._callback_checkpoint_kwargs)
            callbacks.append(checkpoint_callback)

        if self._callback_hist_kwargs["hist_freq"] > 0:
            # hist_callback = CustomCallbackPPO2(**self._callback_hist_kwargs)
            # callbacks.append(hist_callback)
            pass

        try:
            self._model.learn(self._n_steps,
                              log_interval=self._log_interval,
                              callback=callbacks,
                              tb_log_name=self._model_name)
        except KeyboardInterrupt:
            pass

        folder_path = os.path.join(self._save_dir, self._model_name)
        self._model.save(os.path.join(folder_path, self._model_name))

    def test(self):
        "Evaluate the model."
        mean_reward = super()._test(self._model)
        return mean_reward
import gym
from stable_baselines import ACER
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv

# trying to get an idea of how quickly my computer can train this
pong_env = gym.make('Pong-v0')
pong_env = DummyVecEnv([lambda: pong_env])

pong_model_acer = ACER(
    CnnPolicy,
    pong_env,
    verbose=0,
    tensorboard_log="./../../data/baselines-stuff/pong/acer_pong_tensorboard/")

pong_model_acer.learn(total_timesteps=50_000_000,
                      tb_log_name="run-1-50_000_000")

# since I know I'll be stopping it early
pong_model_acer.save(
    './../../data/baselines-stuff/pong/terrible_pong_model_acer')
import gym  # needed for gym.make() below; missing from the original imports

from time import time, ctime
from timeit import default_timer as timer
from datetime import timedelta

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import ACER

# multiprocess environment
env = gym.make('RX_env:RX-v1')
model = ACER(MlpPolicy, env, verbose=1, tensorboard_log="acer_log")

stt = timer()
model.learn(total_timesteps=100000, tb_log_name="first_x_acer")
# model.learn(total_timesteps=1000000, tb_log_name="second_x_a2c", reset_num_timesteps=False)
end = timer()

# model.save("acer_x")
# del model  # remove to demonstrate saving and loading
# model = ACER.load("acer_x")  # the original comment said A2C.load, but this is an ACER model

obs = env.reset("f03.jss")

reward = 0
step = 0
while True:
    step += 1
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
def callback(_locals, _globals):
    # The original snippet begins mid-function; this header follows the
    # standard stable-baselines 2.x callback signature. mean_reward is
    # assumed to be computed above from the Monitor logs (elided here).
    global n_steps, best_mean_reward
    ...
    if mean_reward > best_mean_reward:
        best_mean_reward = mean_reward
        print("Saving new best model")
        _locals['self'].save(model_directory + 'acer-model_' +
                             str(n_steps + 1) + '.pkl')
    n_steps += 1
    return True


if __name__ == "__main__":
    os.makedirs(log_directory, exist_ok=True)
    os.makedirs(model_directory, exist_ok=True)
    env = SubprocVecEnv([
        lambda: Monitor(gym.make('gym_building:building-v0',
                                 people=people,
                                 num_of_lift=3,
                                 height_of_building=5),
                        log_directory,
                        allow_early_resets=True) for i in range(4)
    ])
    model = ACER(env=env,
                 policy=MlpLnLstmPolicy,
                 verbose=1,
                 tensorboard_log="./acer_tensorboard/",
                 learning_rate=0.01,
                 lr_schedule='double_linear_con')
    model.learn(total_timesteps=TIMESTEPS, callback=callback)
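# A sketch of how mean_reward is typically computed in this pattern
# (following the stable-baselines docs callback example, an assumption about
# this snippet's elided code): read the episode rewards that Monitor wrote
# to log_directory and average the most recent ones.
import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

def mean_reward_from_monitor(log_directory, window=100):
    # ts2xy returns (cumulative timesteps, episode rewards) from Monitor logs
    _steps, rewards = ts2xy(load_results(log_directory), 'timesteps')
    return np.mean(rewards[-window:]) if len(rewards) > 0 else -np.inf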
def test_action_mask_learn_acer(vec_env, policy, env_class):
    env = vec_env([env_class] * 2)

    model = ACER(policy, env, verbose=0)
    model.learn(total_timesteps=500)
    env.close()
def main(argv):
    environmentName = ''
    algorithmName = ''

    # Parse arguments
    try:
        opts, args = getopt.getopt(argv, 'e:a:', ['env=', 'alg='])
    except getopt.GetoptError:
        print('--env <environment-name> --alg <algorithm-name>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-e', '--env'):
            environmentName = arg
        elif opt in ('-a', '--alg'):
            algorithmName = arg

    # Create environment
    env = gym.make(environmentName)

    # Create model. Every algorithm is constructed with identical arguments,
    # so a lookup table replaces the original if/elif chain.
    algorithms = {
        'A2C': A2C, 'ACER': ACER, 'ACKTR': ACKTR, 'DDPG': DDPG, 'DQN': DQN,
        'PPO': PPO2, 'SAC': SAC, 'TD3': TD3, 'TRPO': TRPO,
    }
    if algorithmName not in algorithms:
        print('Wrong algorithm')
        sys.exit(2)
    model = algorithms[algorithmName](
        'MlpPolicy', env, verbose=1,
        tensorboard_log=getTensorboardLogLocation(environmentName, algorithmName),
        full_tensorboard_log=False)

    model.learn(total_timesteps=int(STEPS), log_interval=250)
    print('Trained algorithm:')
    print(environmentName, algorithmName)
env = make_vec_env(RPiLEDEnv, env_kwargs=envArgsDict)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-20, verbose=1)
eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=5000,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)
# Added checkpoint because I lost model data after a crash when the webcam
# shut down because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='ppo1_model')
cb = CallbackList([checkpoint_callback, eval_callback])

policy_kwargs = {'layers': [128]}
model = ACER(MlpLnLstmPolicy, env, verbose=1,
             policy_kwargs=policy_kwargs, tensorboard_log='./logs/')
model.learn(total_timesteps=10000, callback=cb)
model.save('acer_rpi_lid')
print('model saved')
num_cpu = 15  # Number of processes to use
# Create the vectorized environment
env = SubprocVecEnv([make_env(x, y, z, i) for i in range(num_cpu)])
eval_env = environment(x, y, z, gamma)

# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

scenario = str(f'{inputfile_s}_t{test}_lr{LR_s}_gamma{gamma_s}_batch{batch_size}')

callbacklist = CallbackList([
    TimeLimit(episodetimesteps),
    EvalCallback(eval_env, log_path=scenario, n_eval_episodes=20,
                 deterministic=False, best_model_save_path=scenario)
])

model = ACER(MlpPolicy, env, gamma=gamma, n_steps=batch_size, learning_rate=LR,
             verbose=1, lr_schedule='constant')  # , tensorboard_log=scenario
model.learn(total_timesteps=episodetimesteps**99, callback=callbacklist)

filename = './%s/evaluations.npz' % scenario
data = np.load(filename)
results = data['results']
y = np.average(results, axis=1)
timesteps = data['timesteps']

plt.plot(timesteps, y)
plt.xlabel('Timesteps')
plt.ylabel('Score')
# plt.show()
savepath = './%s/fig_%s' % (scenario, scenario)
def ttest_env(modelpath, modelname):
    # Every branch of the original if-chain was identical apart from the
    # model class and plot title, so a lookup table replaces it. NOTE: the
    # original passed the raw Monitor env to most models and the DummyVecEnv
    # only to A2C; stable-baselines auto-wraps non-vectorized envs, so the
    # vectorized `e` is used consistently here.
    algos = {
        log_dir_a2c: (A2C, "a2c Monitor"),
        log_dir_acer: (ACER, "acer Monitor"),
        log_dir_acktr: (ACKTR, "ACKTR Monitor"),
        log_dir_dqn: (DQN, "DQN Monitor"),
        log_dir_ppo1: (PPO1, "PPO1 Monitor"),
        log_dir_poo2: (PPO2, "PPO2 Monitor"),
        log_dir_trpo: (TRPO, "TRPO Monitor"),
    }
    for name in modelpath:
        os.makedirs(name, exist_ok=True)
        env = IdentityEnv(18, 18, 60)
        env = Monitor(env, name)
        e = DummyVecEnv([lambda: env])
        if name not in algos:
            continue
        algo_cls, title = algos[name]
        model = algo_cls(policy="MlpPolicy", env=e, verbose=0)
        callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=name)
        time_steps = 1e5
        model.learn(total_timesteps=int(time_steps), callback=callback)
        results_plotter.plot_results([name], time_steps,
                                     results_plotter.X_EPISODES, title)
        plt.show()
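# SaveOnBestTrainingRewardCallback is used above but not defined in this
# snippet. A sketch following the stable-baselines (>= 2.10) docs pattern,
# which is an assumption about the snippet's actual implementation: check
# the Monitor logs every check_freq calls and save the model when the
# rolling mean episode reward improves.
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # (timesteps, episode rewards) recorded by the Monitor wrapper
            _steps, rewards = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(rewards) > 0:
                mean_reward = np.mean(rewards[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True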
if os.path.exists("%s/final_model.zip" % savepath):
    # Instantiate the agent
    model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps,
                 learning_rate=LR, buffer_size=5000, verbose=1,
                 n_cpu_tf_sess=num_cpu)
    # Load the trained agent
    model = ACER.load("%s/final_model" % savepath, env=env)
    print('loaded agent')
    save_evals()
    # Total timesteps set to a very large number so the program terminates
    # based on the runtime parameter.
    model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist)
else:
    # Create model with the Stable Baselines package.
    model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps,
                 learning_rate=LR, buffer_size=5000, verbose=1,
                 n_cpu_tf_sess=num_cpu)  # , tensorboard_log=scenario
    # model = ACER.load("%s/best_model" % savepath, env)
    save_evals()
    # The original snippet is truncated here; the call presumably mirrors
    # the branch above.
    model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist)
from pathlib import Path

from freqtrade.configuration import Configuration
config = Configuration.from_files(['config_rl.json'])

from freqtradegym import TradingEnv

from stable_baselines.common.policies import MlpPolicy
from stable_baselines import ACER

if __name__ == "__main__":
    env = TradingEnv(config)

    policy_kwargs = dict(layers=[32, 32])
    model = ACER(MlpPolicy, env,
                 learning_rate=1e-4,
                 policy_kwargs=policy_kwargs,
                 verbose=0,
                 tensorboard_log="./tensorboard/")
    model.learn(total_timesteps=int(1e+6))
    model.save('model')
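# A hypothetical inference loop for the model saved above; the exact
# observation and termination semantics come from freqtradegym's TradingEnv.
model = ACER.load('model')
obs = env.reset()
done = False
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)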