def train(): """Trains an ACER policy """ env = create_env() model = ACER(policy=CnnPolicy, env=env, gamma=0.99, n_steps=20, num_procs=4, q_coef=0.5, ent_coef=0.01, max_grad_norm=10, learning_rate=0.0007, lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-05, buffer_size=5000, replay_ratio=4, replay_start=1000, correction_term=10.0, trust_region=True, alpha=0.99, delta=1, verbose=1, tensorboard_log="./tb") model.learn(total_timesteps=int(1e7), callback=callback, tb_log_name="acer") model.save("models/pacman_acer.pkl")
def train_ACER(env_train, model_name, timesteps=25000):
    start = time.time()
    model = ACER('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (ACER): ', (end - start) / 60, ' minutes')
    return model
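# Hedged usage sketch (not part of the original source): train_ACER only needs a
# vectorized environment with a discrete action space, so a CartPole placeholder
# stands in for the project's own env_train here. config.TRAINED_MODEL_DIR,
# used inside train_ACER, is the project's own setting and must already exist.
import gym
from stable_baselines import ACER
from stable_baselines.common.vec_env import DummyVecEnv

env_train = DummyVecEnv([lambda: gym.make('CartPole-v1')])
trained_model = train_ACER(env_train, model_name="acer_cartpole", timesteps=25000)

# The saved agent can be reloaded later for evaluation
loaded_model = ACER.load(f"{config.TRAINED_MODEL_DIR}/acer_cartpole")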
def train_acer(timesteps, name):
    env = datares_roulette
    env = DummyVecEnv([env])
    model = ACER(
        stable_baselines.common.policies.MlpPolicy,
        env,
        verbose=1,
    )
    model.learn(total_timesteps=timesteps)
    model.save(name)
    return model
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines import ACER

# There already exists an environment generator
# that will make and wrap atari environments correctly.
# Here we are also multiprocessing training (num_env=4 => 4 processes)
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = ACER('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=25000)
# Save the trained agent
model.save("cnn_pong")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
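# Hedged follow-up sketch (not part of the original example, and meant to be run
# as a separate script, since the rendering loop above never returns): reload the
# saved agent and score it with stable-baselines' evaluate_policy helper, which
# expects a single-environment VecEnv.
from stable_baselines import ACER
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines.common.evaluation import evaluate_policy

eval_env = VecFrameStack(make_atari_env('PongNoFrameskip-v4', num_env=1, seed=0),
                         n_stack=4)
loaded_model = ACER.load("cnn_pong")
mean_reward, std_reward = evaluate_policy(loaded_model, eval_env, n_eval_episodes=10)
print("mean reward: {:.1f} +/- {:.1f}".format(mean_reward, std_reward))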
class ACERAgent(Agent):
    def __init__(
        self,
        model_name="model_name",
        save_dir="./models",
        log_interval=1e4,
        num_cpus=8,
        eval_episodes=1000,
        n_steps=1e6,
        layer_normalization=False,
        model_kwargs={"tensorboard_log": "./tensorboards/"},
        env_kwargs={
            "board_size": 4,
            "binary": True,
            "extractor": "cnn"
        },
        callback_checkpoint_kwargs={
            "save_freq": 0,
            "save_path": "./models/",
            "name_prefix": "model_name"
        },
        callback_hist_kwargs={"hist_freq": 0},
    ):
        super().__init__(
            model_name,
            save_dir,
            num_cpus,
            model_kwargs,
            env_kwargs,
            layer_normalization,
            callback_checkpoint_kwargs,
            callback_hist_kwargs,
            n_steps,
            log_interval,
            eval_episodes,
        )

        self._init_model()

    def _init_model(self):
        if not self._model_kwargs["agent"].lower() == "acer":
            raise ValueError(
                "The model_kwargs dict has to be created using args from the "
                "ACER agent as reference. Make sure the correct parameters are passed.")
        del self._model_kwargs["agent"]

        self._callback_checkpoint_kwargs["save_freq"] = int(
            self._callback_checkpoint_kwargs["save_freq"] / self._num_cpus)

        if self._env_kwargs["extractor"] == "mlp":
            self._model = ACER(CustomMlpPolicy, self._env, **self._model_kwargs)
        else:
            self._model = ACER(CustomCnnPolicy, self._env, **self._model_kwargs)

    def train(self):
        "Optimize the model."
        callbacks = []

        # Checkpoint callback
        if self._callback_checkpoint_kwargs["save_freq"] > 0:
            # Append model name to the checkpoint save_path
            self._callback_checkpoint_kwargs["save_path"] = (
                self._callback_checkpoint_kwargs["save_path"] + "/" +
                str(self._model_name))
            checkpoint_callback = CheckpointCallback(
                **self._callback_checkpoint_kwargs)
            callbacks.append(checkpoint_callback)

        if self._callback_hist_kwargs["hist_freq"] > 0:
            # hist_callback = CustomCallbackPPO2(**self._callback_hist_kwargs)
            # callbacks.append(hist_callback)
            pass

        try:
            self._model.learn(self._n_steps,
                              log_interval=self._log_interval,
                              callback=callbacks,
                              tb_log_name=self._model_name)
        except KeyboardInterrupt:
            pass

        folder_path = os.path.join(self._save_dir, self._model_name)
        self._model.save(os.path.join(folder_path, self._model_name))

    def test(self):
        "Evaluate the model."
        mean_reward = super()._test(self._model)
        return mean_reward
import gym

from stable_baselines import ACER
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv

# Trying to get an idea of how quickly my computer can train this
pong_env = gym.make('Pong-v0')
pong_env = DummyVecEnv([lambda: pong_env])

pong_model_acer = ACER(
    CnnPolicy,
    pong_env,
    verbose=0,
    tensorboard_log="./../../data/baselines-stuff/pong/acer_pong_tensorboard/")

pong_model_acer.learn(total_timesteps=50_000_000,
                      tb_log_name="run-1-50_000_000")

# Since I know I'll be stopping it early
pong_model_acer.save(
    './../../data/baselines-stuff/pong/terrible_pong_model_acer')
env = make_vec_env(RPiLEDEnv, env_kwargs=envArgsDict)

callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-20, verbose=1)
eval_callback = EvalCallback(env,
                             best_model_save_path='./logs/best',
                             log_path='./logs/',
                             eval_freq=5000,
                             deterministic=True,
                             render=False,
                             callback_on_new_best=callback_on_best)
# Added checkpoint because I lost model data after a crash when the webcam
# shut down because the screen went to sleep :(
checkpoint_callback = CheckpointCallback(save_freq=1000,
                                         save_path='./logs/',
                                         name_prefix='ppo1_model')
cb = CallbackList([checkpoint_callback, eval_callback])

policy_kwargs = {'layers': [128]}
model = ACER(MlpLnLstmPolicy,
             env,
             verbose=1,
             policy_kwargs=policy_kwargs,
             tensorboard_log='./logs/')
model.learn(total_timesteps=10000, callback=cb)
model.save('acer_rpi_lid')
print('model saved')
env_id = "/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux"
#env = UnityEnv(env_id, worker_id=2, use_visual=False)

# Create log dir
time_int = int(time.time())
log_dir = "stable_results/basic_env_{}/".format(time_int)
os.makedirs(log_dir, exist_ok=True)

#env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
num_env = 2
worker_id = 9
env = SubprocVecEnv(
    [make_env(env_id, log_dir, i + worker_id) for i in range(num_env)])

model = ACER(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=30000)
model.save(log_dir + "model")

# Evaluate the agent
episodes = 100
ep_r = []
ep_l = []
for e in range(episodes):
    obs = env.reset()
    total_r = 0.
    total_l = 0.
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, infos = env.step(action)
        total_l += 1.
        total_r += rewards[0]
        if dones[0]:
            # Episode finished: record its return and length
            ep_r.append(total_r)
            ep_l.append(total_l)
            break
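# Hedged follow-up sketch (not part of the original excerpt): summarize the
# per-episode returns and lengths collected in the evaluation loop above.
import numpy as np

print("Mean episode reward: {:.2f}".format(np.mean(ep_r)))
print("Mean episode length: {:.2f}".format(np.mean(ep_l)))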
from pathlib import Path

from freqtrade.configuration import Configuration

config = Configuration.from_files(['config_rl.json'])

from freqtradegym import TradingEnv

from stable_baselines.common.policies import MlpPolicy
from stable_baselines import ACER

if __name__ == "__main__":
    env = TradingEnv(config)

    policy_kwargs = dict(layers=[32, 32])
    model = ACER(MlpPolicy,
                 env,
                 learning_rate=1e-4,
                 policy_kwargs=policy_kwargs,
                 verbose=0,
                 tensorboard_log="./tensorboard/")
    model.learn(total_timesteps=int(1e+6))
    model.save('model')
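    # Hedged usage sketch (not part of the original script): reload the saved
    # agent and step it through one episode of the same TradingEnv. Assumes
    # TradingEnv follows the standard gym step API (obs, reward, done, info).
    loaded = ACER.load('model')
    obs = env.reset()
    done = False
    while not done:
        action, _states = loaded.predict(obs)
        obs, reward, done, info = env.step(action)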