def train_TD3(self, model_name, model_params=config.TD3_PARAMS):
    """TD3 model"""
    from stable_baselines import TD3
    from stable_baselines.common.noise import NormalActionNoise

    env_train = self.env
    n_actions = env_train.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    start = time.time()
    model = TD3('MlpPolicy', env_train,
                batch_size=model_params['batch_size'],
                buffer_size=model_params['buffer_size'],
                learning_rate=model_params['learning_rate'],
                action_noise=action_noise,
                verbose=model_params['verbose'])
    model.learn(total_timesteps=model_params['timesteps'])
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (TD3): ', (end - start) / 60, ' minutes')
    return model
def __call__(self):
    policy_kwargs = dict(layers=[400, 300, 200, 100])
    n_actions = self.env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    # check_env(self.env)

    # TD3 takes `buffer_size` for the replay buffer (`memory_limit` is the DDPG argument)
    model = TD3(MlpPolicy, self.env,
                policy_kwargs=policy_kwargs,
                action_noise=action_noise,
                buffer_size=50000,
                tensorboard_log="/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                verbose=1)

    time_steps = 3e4
    model.learn(total_timesteps=int(time_steps), log_interval=50,
                tb_log_name="td3_Docker_" + self.expt_name)
    model.save("/home/dfki.uni-bremen.de/mpatil/Documents/td3_stable_baselines_" + self.expt_name)

    print("Closing environment")
    self.env.close()
def train_TD3(env_train, model_name, model=None, timesteps=30000, save_path=None):
    """TD3 model"""
    # add the noise objects for TD3
    n_actions = env_train.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    start = time.time()
    if model is None:
        model = TD3('MlpPolicy', env_train, action_noise=action_noise)
    else:
        model.set_env(env_train)
    model.verbose = config.VERBOSE
    model.learn(total_timesteps=timesteps)
    end = time.time()

    if save_path is None:
        save_path = f"{config.TRAINED_MODEL_DIR}/{model_name}"
    model.save(save_path)
    print('Training time (TD3): ', (end - start) / 60, ' minutes')
    return model
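# A minimal usage sketch for the train_TD3 helper above, assuming the surrounding
# `config` module (TRAINED_MODEL_DIR, VERBOSE) is available; 'Pendulum-v0' and the
# timestep budget are illustrative choices, not values taken from the source.
import gym
from stable_baselines.common.vec_env import DummyVecEnv

env_train = DummyVecEnv([lambda: gym.make('Pendulum-v0')])
td3_model = train_TD3(env_train, model_name='td3_pendulum', timesteps=10000)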
def sample_td3_params(trial):
    """
    Sampler for TD3 hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical('buffer_size', [int(1e4), int(1e5), int(1e6)])
    train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 1000, 2000])
    gradient_steps = train_freq
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)

    hyperparams = {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'train_freq': train_freq,
        'gradient_steps': gradient_steps,
    }

    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                                        sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
                                                                   sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
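# A minimal sketch of how sample_td3_params could be driven by Optuna. The caller is
# assumed to attach `n_actions` to the trial (the sampler reads `trial.n_actions`),
# and `make_env` stands in for a user-provided environment factory; both are
# assumptions rather than code from the source.
import optuna
from stable_baselines import TD3
from stable_baselines.common.evaluation import evaluate_policy

def objective(trial):
    env = make_env()  # hypothetical environment factory
    trial.n_actions = env.action_space.shape[-1]
    hyperparams = sample_td3_params(trial)
    model = TD3('MlpPolicy', env, verbose=0, **hyperparams)
    model.learn(total_timesteps=10000)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    return -mean_reward  # Optuna minimises by default

study = optuna.create_study()
study.optimize(objective, n_trials=20)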
def _on_step(self) -> bool:
    """
    This method will be called by the model after each call to `env.step()`.

    For child callback (of an `EventCallback`), this will be called
    when the event is triggered.

    :return: (bool) If the callback returns False, training is aborted early.
    """
    global SIGMA, LEARNING_RATE
    if (self.num_timesteps % 1000) == 0:
        # import pdb; pdb.set_trace()
        t = time.time()
        time_elapsed = t - self.startTime  # seconds
        self.model.save("td3_model_int_test")
        SIGMA = SIGMA * .9
        # LEARNING_RATE = LEARNING_RATE * .9
        print("---------" + str(self.num_timesteps) + " steps complete | SIGMA = " + str(SIGMA)
              + " | Learning Rate: " + str(LEARNING_RATE) + "|----------")
        print("---------------Time Elapsed: " + str(time_elapsed) + " seconds")
        f = open(os.path.join(dirName, "learn.txt"), "a")
        f.write("---------" + str(self.num_timesteps) + " steps complete | SIGMA = " + str(SIGMA)
                + " | Learning Rate: " + str(LEARNING_RATE) + "|----------\n")
        f.write("--------- Time Elapsed: " + str(time_elapsed) + " seconds -----------\n")
        f.close()
        self.model.action_noise = NormalActionNoise(0, SIGMA)  # annealed noise
        # self.model.learning_rate = LEARNING_RATE
        # td3_noise = OrnsteinUhlenbeckActionNoise(np.zeros(a_dim), sigma*np.ones(a_dim))
    print("\t--Step Done --\t|")
    if yPos_global > 200:
        input("Please reset the robot to start and press enter key to continue..")
    return True
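# A minimal sketch of the surrounding callback class, assuming it subclasses
# stable_baselines.common.callbacks.BaseCallback and that SIGMA, LEARNING_RATE,
# dirName and yPos_global are module-level globals as the method above implies.
# The __init__ shown here (recording startTime) is an assumption, not code from the source.
from stable_baselines.common.callbacks import BaseCallback

class AnnealNoiseCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(AnnealNoiseCallback, self).__init__(verbose)
        # wall-clock reference used by _on_step to report elapsed time
        self.startTime = time.time()

    # the _on_step method defined above would be attached here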
def sample_ddpg_params(trial): """ Sampler for DDPG hyperparams. :param trial: (optuna.trial) :return: (dict) """ gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]) # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1) # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1) learning_rate = trial.suggest_loguniform('lr', 1e-5, 1) batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512]) buffer_size = trial.suggest_categorical('memory_limit', [int(1e4), int(1e5), int(1e6)]) noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal']) noise_std = trial.suggest_uniform('noise_std', 0, 1) normalize_observations = trial.suggest_categorical('normalize_observations', [True, False]) normalize_returns = trial.suggest_categorical('normalize_returns', [True, False]) hyperparams = { 'gamma': gamma, 'actor_lr': learning_rate, 'critic_lr': learning_rate, 'batch_size': batch_size, 'memory_limit': buffer_size, 'normalize_observations': normalize_observations, 'normalize_returns': normalize_returns } if noise_type == 'normal': hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(1), sigma=noise_std * np.ones(1)) elif noise_type == 'ornstein-uhlenbeck': hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(1), sigma=noise_std * np.ones(1)) return hyperparams
def sample_ddpg_params(trial): """ Sampler for DDPG hyperparams. :param trial: (optuna.trial) :return: (dict) """ gamma = trial.suggest_categorical( 'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]) # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1) # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1) learning_rate = trial.suggest_loguniform('lr', 1e-5, 1) batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256]) buffer_size = trial.suggest_categorical( 'memory_limit', [int(1e4), int(1e5), int(1e6)]) noise_type = trial.suggest_categorical( 'noise_type', ['ornstein-uhlenbeck', 'normal', 'adaptive-param']) noise_std = trial.suggest_uniform('noise_std', 0, 1) normalize_observations = trial.suggest_categorical( 'normalize_observations', [True, False]) normalize_returns = trial.suggest_categorical('normalize_returns', [True, False]) hyperparams = { 'gamma': gamma, 'actor_lr': learning_rate, 'critic_lr': learning_rate, 'batch_size': batch_size, 'memory_limit': buffer_size, 'normalize_observations': normalize_observations, 'normalize_returns': normalize_returns } if noise_type == 'adaptive-param': hyperparams['param_noise'] = AdaptiveParamNoiseSpec( initial_stddev=noise_std, desired_action_stddev=noise_std) # Apply layer normalization when using parameter perturbation hyperparams['policy_kwargs'] = dict(layer_norm=True) elif noise_type == 'normal': hyperparams['action_noise'] = NormalActionNoise( mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)) elif noise_type == 'ornstein-uhlenbeck': hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise( mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)) return hyperparams
def start_unity_baselines():
    # Set to FALSE for CIP-Pool execution
    # env = make_unity_env('./envs/worm_dynamic_one_agent/linux/worm_dynamic', 1, False)
    # InitialTrainingExample.start_training(env)
    # env.close()
    unity_env = UnityEnvironment('./envs/worm_dynamic_one_agent/linux/worm_dynamic', no_graphics=True)
    env = UnityToGymWrapper(unity_env, uint8_visual=False)
    env = Monitor(env, 'results/')

    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

    model = TD3_Baselines(MlpPolicy, env, action_noise=action_noise, verbose=1)
    model.learn(total_timesteps=int(2e6), log_interval=10)
    model.save("td3_worm")
def test_deterministic_td3():
    results = [[], []]
    rewards = [[], []]
    kwargs = {'n_cpu_tf_sess': 1}
    env_id = 'Pendulum-v0'
    kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)})

    for i in range(2):
        model = TD3('MlpPolicy', env_id, seed=SEED, **kwargs)
        model.learn(N_STEPS_TRAINING)
        env = model.get_env()
        obs = env.reset()
        for _ in range(20):
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, _, _ = env.step(action)
            results[i].append(action)
            rewards[i].append(reward)

    # without the extended tolerance, test fails for unknown reasons on Github...
    assert np.allclose(results[0], results[1], rtol=1e-2), results
    assert np.allclose(rewards[0], rewards[1], rtol=1e-2), rewards
def optimize_agent(trial):
    """
    Train the model and evaluate it.
    Optuna minimises the objective by default, so we return the negated mean reward.
    """
    model_params = optimize_TD3(trial)
    env = SubprocVecEnv([lambda: NormalizeActionWrapper(LearningRocket(visualize=False))
                         for i in range(n_cpu)])
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    model = TD3(MlpPolicy, env, action_noise=action_noise, policy_kwargs=dict(layers=[400, 300]))
    model.learn(50000)

    rewards = []
    n_episodes, reward_sum = 0, 0.0
    obs = env.reset()
    step = 0
    while n_episodes < 4:
        step += 1
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(-1 * last_reward, step)
    return -1 * last_reward
def DDPGgive_results(files, balance, shares=None):
    env = create_stock_env(files, train=False, balance=balance, shares=shares)
    max_steps = env.max_steps - env.num_prev
    env = DummyVecEnv([lambda: env])
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(0, 2)
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=1, desired_action_stddev=0.1,
                                         adoption_coefficient=1.01)
    model = DDPG(CustomDDPGPolicy, env, verbose=0, param_noise=param_noise, action_noise=action_noise)
    # model = DDPG.load("/home/harshit/Documents/itsp-trade agent/Reinforcement-Learning-Stock-Trader/WebPortal/StockApp/Stock_stable.zip", env=env)
    model.learn(total_timesteps=100)

    profit = 0
    profitst = np.zeros((max_steps - 1, 2))
    actionst = np.zeros((n_actions // 2, max_steps - 1, 2))
    shares = np.zeros((len(files), max_steps - 1, 2))
    obs = env.reset()
    for i in range(max_steps):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        actionst[:, i, 1] = -info[0]['action'][0][0:n_actions // 2] + info[0]['action'][0][n_actions // 2:]
        actionst[:, i, 0] = i
        shares[:, i, 1] = info[0]['shares_held']
        shares[:, i, 0] = i
        # print('a',action)
        profit += rewards
        profitst[i] = [i, profit]
        if dones:
            break

    print(info[0]['action'][0])
    print(actionst)
    return profitst.tolist(), shares.tolist(), actionst.tolist()
def test_deterministic_training_common(algo):
    results = [[], []]
    rewards = [[], []]
    kwargs = {'n_cpu_tf_sess': 1}
    if algo in [DDPG, TD3, SAC]:
        env_id = 'Pendulum-v0'
        kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)})
    else:
        env_id = 'CartPole-v1'
        if algo == DQN:
            kwargs.update({'learning_starts': 100})

    for i in range(2):
        model = algo('MlpPolicy', env_id, seed=SEED, **kwargs)
        model.learn(N_STEPS_TRAINING)
        env = model.get_env()
        obs = env.reset()
        for _ in range(100):
            action, _ = model.predict(obs, deterministic=False)
            obs, reward, _, _ = env.step(action)
            results[i].append(action)
            rewards[i].append(reward)

    assert sum(results[0]) == sum(results[1]), results
    assert sum(rewards[0]) == sum(rewards[1]), rewards
from stable_baselines.td3.policies import MlpPolicy
from stable_baselines import TD3
from TD3_test import TD3_ff
from FireflyEnv import firefly_acc
from Config import Config
arg = Config()
import numpy as np
from numpy import pi
import time
from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from reward_functions import reward_singleff

action_noise = NormalActionNoise(mean=np.zeros(2), sigma=float(0.1) * np.ones(2))

arg.goal_radius_range = [0.15, 0.3]
arg.std_range = [0.02, 0.2, 0.02, 0.2]
arg.TERMINAL_VEL = 0.025  # terminal velocity?
# norm(action) that you believe as a signal to stop 0.1.
arg.DELTA_T = 0.2
arg.EPISODE_LEN = 35

env = firefly_acc.FireflyAcc(arg)

modelname = None
# modelname = 'trained_agent/' + 'TD_acc_control_retrain_1000000_1_5_6_57'
if modelname is None:  # new train
    model = TD3_ff(
        MlpPolicy,
        env,
        verbose=1,
        tensorboard_log="./Tensorboard/",
if args.save_freq > 0:
    callbacks.append(CheckpointCallback(save_freq=args.save_freq, save_path=save_path,
                                        name_prefix='rl_model'))

algo = {
    'sac': SAC,
    'td3': TD3
}[args.algo]

n_actions = env.action_space.shape[0]

# Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
hyperparams = {
    'sac': dict(batch_size=256, gamma=0.98, policy_kwargs=dict(layers=[256, 256]),
                learning_starts=10000, buffer_size=int(2e5), tau=0.01),
    'td3': dict(batch_size=100, policy_kwargs=dict(layers=[400, 300]),
                learning_rate=1e-3, learning_starts=10000, buffer_size=int(1e6),
                train_freq=1000, gradient_steps=1000,
                action_noise=NormalActionNoise(mean=np.zeros(n_actions),
                                               sigma=0.1 * np.ones(n_actions)))
}[args.algo]

model = algo('MlpPolicy', env, verbose=1, **hyperparams)

try:
    model.learn(n_timesteps, callback=callbacks)
except KeyboardInterrupt:
    pass

print("Saving to {}.zip".format(save_path))
model.save(save_path)
def run_process(study_name, alg_param, env_param, log_path='.'):
    study_path = os.path.join(log_path, study_name)
    make_sure_path_exists(study_path)
    trial_path, trial_id = generate_trial_path(study_path)
    make_sure_path_exists(trial_path)

    with open(trial_path + '/alg_param.pkl', "wb+") as outfile:
        pickle.dump(alg_param, outfile)
    with open(trial_path + '/env_param.pkl', "wb+") as outfile:
        pickle.dump(env_param, outfile)

    num_nodes = alg_param['num_nodes']
    num_layers = alg_param['num_layers']
    learning_rate = alg_param['learning_rate']
    alg = alg_param['alg']
    nenv = alg_param['nenv']
    env = build_env(trial_path, env_param, nenv=nenv)

    if alg == 'dqn':
        from stable_baselines.deepq.policies import MlpPolicy
        from stable_baselines import DQN
        call_iter = 1000
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = DQN(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs,
                    tensorboard_log=trial_path)
    # DDPG calls back every step of every rollout
    elif alg == 'ddpg':
        from stable_baselines.ddpg.policies import MlpPolicy
        from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
        from stable_baselines import DDPG
        call_iter = 1000
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * np.ones(n_actions))
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise,
                     policy_kwargs=policy_kwargs, tensorboard_log=trial_path)
    elif alg == 'td3':
        from stable_baselines import TD3
        from stable_baselines.td3.policies import MlpPolicy
        from stable_baselines.common.vec_env import DummyVecEnv
        from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
        call_iter = 1000
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = TD3(MlpPolicy, env, verbose=1, action_noise=action_noise, learning_rate=learning_rate,
                    policy_kwargs=policy_kwargs, tensorboard_log=trial_path)
    # PPO1 calls back only after every rollout
    elif alg == 'ppo2':
        from stable_baselines.common.policies import MlpPolicy
        from stable_baselines import PPO2
        call_iter = 100
        policy_kwargs = dict(net_arch=[num_nodes for _ in range(num_layers)])
        model = PPO2(MlpPolicy, env, policy_kwargs=policy_kwargs, verbose=1,
                     learning_rate=learning_rate, tensorboard_log=trial_path,
                     n_steps=alg_param['n_steps'], noptepochs=alg_param['noptepochs'],
                     nminibatches=alg_param['nminibatches'], gamma=alg_param['gamma'],
                     ent_coef=alg_param['ent_coef'], cliprange=alg_param['cliprange'],
                     lam=alg_param['lam'])

    best_mean_reward, n_steps = -np.inf, 0

    # callback frequency differs among algorithms
    def callback(_locals, _globals):
        from stable_baselines.results_plotter import load_results, ts2xy
        nonlocal n_steps, best_mean_reward, call_iter
        # Print stats every 1000 calls
        if (n_steps + 1) % call_iter == 0:
            # Evaluate policy training performance
            x, y = ts2xy(load_results(trial_path), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-200:])
                print(x[-1], 'timesteps')
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                    best_mean_reward, mean_reward))
                # New best model, you could save the agent here
                if mean_reward > best_mean_reward:
                    best_mean_reward = mean_reward
                    # Example for saving best model
                    print("Saving new best model")
                    _locals['self'].save(trial_path + '/best_model.pkl')
        n_steps += 1
        return True

    # model = DDPG.load('log/A00/best_model.pkl')
    # model.set_env(env)
    print(f"Starting to train {trial_id}")
    model.learn(total_timesteps=int(1e6), tb_log_name='tb_log', callback=callback)
    model.save(trial_path + '/fully_trained_model')
# run subscriber nodes
RL_subscribers()
print("Starting...")
time.sleep(3)  # give ROS time to set up

# init environment
env = soft_learner()
print('done')
a_dim = env.action_space.shape[0]

# td3_noise = OrnsteinUhlenbeckActionNoise(np.zeros(a_dim), .9 * np.ones(a_dim))
td3_noise = NormalActionNoise(0, SIGMA)
td3_env = DummyVecEnv([lambda: env])
# td3_env = env

checkpoint_on_event = CheckpointCallback(save_freq=1000, save_path="./logs/model_checkpoints",
                                         name_prefix='rl_model')
event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)
eval_callback = EvalCallback(td3_env, best_model_save_path='./logs/', log_path='./logs/',
                             eval_freq=100, deterministic=True, render=False)
# td3_model.learning_starts = 100
SIM_NUMBER = 999

## Training policies
# CustomPolicy_3
# CustomPolicy_2 Standard mlp stable baselines policy with modified layer-size
# CustomPolicy_4 Modified initialization of layers and layer-size
policy = CustomPolicy_4
env.episode_duration = NSTEPS

## the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = NormalActionNoise(0, range_esp)

# model = DDPG(policy, env, verbose=1, nb_train_steps=NSTEPS, nb_rollout_steps=NSTEPS, nb_eval_steps=NSTEPS,
#              gamma=DECAY_RATE, param_noise=None, action_noise=action_noise, batch_size=BATCH_SIZE,
#              actor_lr=POLICY_LEARNING_RATE, critic_lr=QVALUE_LEARNING_RATE, buffer_size=REPLAY_SIZE, tau=UPDATE_RATE)
model = DDPG(policy, env, verbose=1, nb_train_steps=1, nb_rollout_steps=1, nb_eval_steps=0,
             gamma=DECAY_RATE, param_noise=None, action_noise=action_noise, batch_size=BATCH_SIZE,
             actor_lr=POLICY_LEARNING_RATE, critic_lr=QVALUE_LEARNING_RATE,
             buffer_size=REPLAY_SIZE, tau=UPDATE_RATE)

# mean_reward, std_reward = evaluate_policy(model, env_eval, n_eval_episodes=10)
# print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

start_time = time.time()
model.learn(total_timesteps=NSTEPS * NEPISODES)
end_time = time.time()
elapsed_time = end_time - start_time

model.save("ddpg_pendulum_stb_baselines_" + str(SIM_NUMBER))
print('elapsed ' + str(elapsed_time) + 's')
def __init__(self, algorithm="SAC", load=True, agent_name="Agent001"): self.agent_name = agent_name #self.env = LearningRocket(visualize=False) #self.env = NormalizeActionWrapper(self.env) #self.eval_env = LearningRocket(visualize=True) #self.eval_env = NormalizeActionWrapper(self.eval_env) #self.env = SubprocVecEnv([lambda: LearningRocket(visualize=False) for i in range(4)]) self.env = make_vec_env( LearningRocket, n_envs=16 ) #[lambda: LearningRocket(visualize=False) for i in range(16)])) #self.eval_env = VecNormalize(DummyVecEnv([lambda: LearningRocket(visualize=True) for i in range(1)])) self.eval_env = make_vec_env(lambda: LearningRocket(visualize=True), n_envs=1) #self.eval_env = VecNormalize(self.eval_env) self.eval_callback = EvalCallback(self.eval_env, best_model_save_path='Agent007', log_path='./logs/', eval_freq=10000, deterministic=True, render=False, n_eval_episodes=1) kai_policy = dict(act_fun=tf.nn.tanh, net_arch=[400, 300]) #check_env(self.env, warn=True) """ if algorithm == "SAC": if load is True: self.model = SAC.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/") #self.model.ent_coef=0.2 else: self.model = SAC('MlpPolicy', self.env, verbose=1, tensorboard_log="./rocket_tensorboard/",ent_coef=5) print("Trainer Set for SAC") """ if algorithm == "TD3": n_actions = self.env.action_space.shape[-1] action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) if load is True: self.model = TD3.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/") #file = open('replay_buffer', 'rb') #self.model.replay_buffer = pickle.load(file) #file.close() else: self.model = TD3(MlpPolicy, self.env, action_noise=action_noise, batch_size=768, gamma=0.95, learning_rate=1e-4, learning_starts=20000, verbose=1, tensorboard_log="./rocket_tensorboard/", policy_kwargs=dict(layers=[400, 300])) print("Trainer Set for TD3") elif algorithm == "PPO2": if load is True: self.model = PPO2.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/") self.eval_env = VecNormalize.load(self.agent_name + "vEnv", self.eval_env) #self.eval_env.clip_obs = 500 #self.env = VecNormalize(self.env) self.env = VecNormalize.load(self.agent_name + "vEnv", self.env) #self.env.clip_obs = 500 #self.env.norm_obs = False #self.eval_env.norm_obs = False else: self.model = PPO2(PPOMlpPolicy, self.env, n_steps=1024, nminibatches=32, lam=0.98, gamma=0.999, noptepochs=4, ent_coef=0.01, verbose=1, tensorboard_log="./rocket_tensorboard/", policy_kwargs=dict(layers=[400, 300])) self.eval_env = VecNormalize(self.eval_env) self.env = VecNormalize(self.env) #self.eval_env.clip_obs = 500 #self.env.clip_obs = 500 #self.env.norm_obs=False #self.eval_env.norm_obs=False print("Trainer set for PPO2. I am speed.")
n_actions = env.action_space.shape[0]
if 'adaptive-param' in noise_type:
    assert algo_ == 'ddpg', 'Parameter is not supported by SAC'
    hyperparams['param_noise'] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                        desired_action_stddev=noise_std)
elif 'normal' in noise_type:
    if 'lin' in noise_type:
        hyperparams['action_noise'] = LinearNormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=noise_std * np.ones(n_actions),
            final_sigma=hyperparams.get('noise_std_final', 0.0) * np.ones(n_actions),
            max_steps=n_timesteps)
    else:
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(n_actions),
                                                        sigma=noise_std * np.ones(n_actions))
elif 'ornstein-uhlenbeck' in noise_type:
    hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                               sigma=noise_std * np.ones(n_actions))
else:
    raise RuntimeError('Unknown noise type "{}"'.format(noise_type))

print("Applying {} noise with std {}".format(noise_type, noise_std))
del hyperparams['noise_type']
del hyperparams['noise_std']
if 'noise_std_final' in hyperparams:
    del hyperparams['noise_std_final']

if ALGOS[args.algo] is None:
    raise ValueError('{} requires MPI to be installed'.format(args.algo))
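# A minimal sketch of the kind of hyperparameter dict the noise-parsing block above
# consumes, e.g. as loaded from a zoo-style YAML file. The concrete values here are
# illustrative assumptions, not taken from the source.
hyperparams = {
    'noise_type': 'ornstein-uhlenbeck',  # or 'normal', 'lin-normal', 'adaptive-param'
    'noise_std': 0.5,
    'buffer_size': int(1e6),
    'batch_size': 100,
}
# After parsing, 'noise_type'/'noise_std' are removed and replaced by an
# 'action_noise' (or 'param_noise') object passed to the algorithm constructor.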
import pytest
import numpy as np

from stable_baselines import TD3, PPO
from stable_baselines.common.noise import NormalActionNoise

action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))


def test_td3():
    model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
                seed=0, learning_starts=100, verbose=1, create_eval_env=True,
                action_noise=action_noise)
    model.learn(total_timesteps=10000, eval_freq=5000)
    # model.save("test_save")
    # model.load("test_save")
    # os.remove("test_save.zip")


@pytest.mark.parametrize("model_class", [PPO])
@pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0'])
def test_onpolicy(model_class, env_id):
    model = model_class('MlpPolicy', env_id, policy_kwargs=dict(net_arch=[16]), verbose=1,
def _preprocess_hyperparams(self, _hyperparams):
    # Convert to python object if needed
    if "policy_kwargs" in _hyperparams.keys() and isinstance(_hyperparams["policy_kwargs"], str):
        _hyperparams["policy_kwargs"] = eval(_hyperparams["policy_kwargs"])

    n_timesteps = _hyperparams.pop("n_timesteps", None)
    n_envs = _hyperparams.pop("n_envs", None)
    log_every = _hyperparams.pop("log_every", None)

    if not self.continue_learning:
        if not log_every:
            self.logger.debug("log_every not defined in yml file: using command line log_every {}".format(self.log_every))
            log_every = self.log_every
        else:
            self.logger.debug("using log_every as defined in yml file: {}".format(log_every))
    else:
        self.logger.debug("priority to command line log_every {}".format(self.log_every))
        log_every = self.log_every

    # Parse noise string
    if self.algo_name in ["ddpg", "sac", "td3"] and _hyperparams.get("noise_type") is not None:
        noise_type = _hyperparams["noise_type"].strip()
        noise_std = _hyperparams["noise_std"]
        n_actions = get_n_actions(env_name=self.env_name, env_variables=self.env_kwargs)
        self.logger.debug("n_actions: {}".format(n_actions))
        if "adaptive-param" in noise_type:
            assert self.algo_name == "ddpg", "Parameter is not supported by SAC"
            _hyperparams["param_noise"] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                                 desired_action_stddev=noise_std)
        elif "normal" in noise_type:
            if "lin" in noise_type:
                _hyperparams["action_noise"] = LinearNormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions),
                    final_sigma=_hyperparams.get("noise_std_final", 0.0) * np.ones(n_actions),
                    max_steps=n_timesteps,
                )
            else:
                _hyperparams["action_noise"] = NormalActionNoise(
                    mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
                )
        elif "ornstein-uhlenbeck" in noise_type:
            _hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
            )
        else:
            raise RuntimeError('Unknown noise type "{}"'.format(noise_type))
        self.logger.debug("Applying {} noise with std {}".format(noise_type, noise_std))
        del _hyperparams["noise_type"]
        del _hyperparams["noise_std"]
        if "noise_std_final" in _hyperparams:
            del _hyperparams["noise_std_final"]

    normalize_kwargs = _parse_normalize(dictionary=_hyperparams)

    if n_envs is None:
        self.logger.debug("n_envs not defined in yml file: using command line n_envs {}".format(self.num_envs))
        n_envs = self.num_envs
    else:
        self.logger.debug("using n_envs as num of envs defined in yml file: {}".format(n_envs))

    if not self.continue_learning:
        # priority to yml defined n_timesteps
        if n_timesteps is None:
            self.logger.debug("n_timesteps not defined in yml file: using command line n_timesteps {}".format(self.train_total_timesteps))
            n_timesteps = self.train_total_timesteps
        else:
            self.logger.debug("using n_timesteps as total timesteps defined in yml file: {}".format(n_timesteps))
            n_timesteps = int(n_timesteps)
    else:
        if self.train_total_timesteps and self.train_total_timesteps != -1:
            assert self.train_total_timesteps <= int(n_timesteps), "train_total_timesteps <= n_timesteps: {}, {}".format(
                self.train_total_timesteps, n_timesteps
            )
            # priority to command line n_timesteps
            self.logger.debug("priority to command line n_timesteps {}".format(self.train_total_timesteps))
            n_timesteps = self.train_total_timesteps
        elif self.train_total_timesteps == -1:
            assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
            n_timesteps = int(n_timesteps)
            self.logger.info("training in continual learning = training from scratch. n_timesteps {}".format(n_timesteps))
n_timesteps {}".format(n_timesteps)) else: assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps) n_timesteps = int(n_timesteps // 2) self.logger.debug( "train_total_timesteps not specified in continue_learning: " "taking half of original n_timesteps defined in yml file {}".format(n_timesteps) ) assert n_timesteps % log_every == 0, "it should be possible to divide n_timesteps for log_every: {}, {}".format( n_timesteps, log_every ) return normalize_kwargs, n_envs, n_timesteps, log_every, _hyperparams
act = tf.nn.tanh

if args.algo == 'TD4_IQN':
    model = TD4('MlpPolicy', env, gamma=0.99, buffer_size=int(1e5), learning_starts=10000,
                tau=args.tau, policy_delay=args.policy_delay, batch_size=128, learning_rate=1e-3,
                train_freq=args.train_freq, gradient_steps=args.train_freq, verbose=args.verbose,
                action_noise=NormalActionNoise(0, sigma=0.1),
                n_support=args.n_support, risk_factor=args.riskfactor,
                policy_kwargs=dict(layers=[128, 128], act_fun=act),
                model_type="IQN", tensorboard_log=args.logdir + env_name, seed=args.seed)
if args.algo == 'TD4_FQF':
    model = TD4('MlpPolicy', env, gamma=0.99, buffer_size=int(1e5), learning_starts=10000,
                tau=args.tau, policy_delay=args.policy_delay, batch_size=128,
import gym
import numpy as np
import gym_routing

from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
from stable_baselines import DDPG

env = gym.make('zzz-v1')

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
# action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

try:
    model = DDPG.load(
        "ddpg_0308",
        env=env,
        tensorboard_log="/home/carla/openai_baselines_update/stable_baseline/log/0308/")
    print("load saved model")
except:
    model = DDPG(
        MlpPolicy,
        env,
        verbose=1,
        param_noise=param_noise,
        action_noise=action_noise,