def train_DDPG(self, model_name, model_params=config.DDPG_PARAMS):
    """DDPG model"""
    from stable_baselines3 import DDPG
    from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

    env_train = self.env
    n_actions = env_train.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

    start = time.time()
    model = DDPG('MlpPolicy',
                 env_train,
                 batch_size=model_params['batch_size'],
                 buffer_size=model_params['buffer_size'],
                 action_noise=action_noise,
                 verbose=model_params['verbose'],
                 tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}")
    model.learn(total_timesteps=model_params['timesteps'], tb_log_name="DDPG_run")
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
def ddpg(env, hyper, policy="MlpPolicy", verbose=0, tensorboard_log=None,
         seed=0, use_sde=True, device="auto"):
    policy_kwargs = make_policy_kwargs(hyper, "ddpg")
    hyper = action_noise(hyper, "ddpg", n_actions=env.action_space.shape[0])
    model = DDPG(policy,  # honor the policy argument rather than a hard-coded string
                 env,
                 verbose=verbose,
                 tensorboard_log=tensorboard_log,
                 seed=seed,
                 gamma=hyper['params_gamma'],
                 learning_rate=hyper['params_lr'],
                 batch_size=int(hyper['params_batch_size']),    # plain int; np.int is deprecated
                 buffer_size=int(hyper['params_buffer_size']),
                 action_noise=hyper['params_action_noise'],
                 train_freq=hyper['params_train_freq'],
                 # gradient_steps=int(hyper['params_train_freq']),
                 # n_episodes_rollout=int(hyper['params_n_episodes_rollout']),
                 policy_kwargs=policy_kwargs,
                 device=device)
    return model
def create_model(env, algorithm, save_path):
    # the noise object
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.2 * np.ones(n_actions),
                                                theta=0.15)

    if algorithm == "ddpg":
        return DDPG(DDPG_MlpPolicy, env,
                    learning_rate=0.001,
                    buffer_size=1000000,
                    batch_size=64,
                    tau=0.001,
                    gamma=0.99,
                    train_freq=(10, "step"),
                    action_noise=action_noise,
                    policy_kwargs=dict(optimizer_class=th.optim.AdamW),
                    tensorboard_log=save_path)
    elif algorithm == "td3":
        return TD3(TD3_MlpPolicy, env, action_noise=action_noise, tensorboard_log=save_path)
    elif algorithm == "sac":
        return SAC(SAC_MlpPolicy, env, action_noise=action_noise, tensorboard_log=save_path)
    else:
        raise Exception("--> Alican's LOG: Unknown agent type!")
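A minimal usage sketch for the factory above; the environment id, log path, and save path are assumptions for illustration, not taken from the original code.

# Sketch only: assumes a continuous-action Gym env and the module's own imports above.
import gym

env = gym.make("Pendulum-v1")                       # hypothetical environment choice
model = create_model(env, algorithm="ddpg", save_path="./tb_logs/")
model.learn(total_timesteps=50000)
model.save("./models/ddpg_pendulum")                # hypothetical save path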
def train():
    best_reward, best_reward_timesteps = None, None
    save_path = "model_save/" + MODEL_PATH + "/"
    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
    # log_dir = f"model_save/"
    log_dir = save_path

    env, env_eval = ENV(util='train', par=PARAM, dt=DT), ENV(util='val', par=PARAM, dt=DT)
    env, env_eval = Monitor(env, log_dir), Monitor(env_eval, log_dir)
    env, env_eval = DummyVecEnv([lambda: env]), DummyVecEnv([lambda: env_eval])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    if PARAM['algo'] == 'td3':
        model = TD3('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                    seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    elif PARAM['algo'] == 'ddpg':
        model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                     seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    elif PARAM['algo'] == 'ppo':
        model = PPO('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                    seed=PARAM['seed'])

    # Note: save_freq and the best_mean_reward_overall / best_mean_reward_timestep
    # attributes are not part of the stock SB3 EvalCallback; this assumes a customized subclass.
    eval_callback = EvalCallback(env_eval,
                                 best_model_save_path=save_path + MODEL_PATH + '_best_model',
                                 log_path=log_dir,
                                 eval_freq=PARAM['eval_freq'],
                                 save_freq=PARAM['save_freq'],
                                 deterministic=True,
                                 render=False)
    model.learn(total_timesteps=int(PARAM['total_time_step']),
                callback=eval_callback, log_interval=500)
    print("best mean reward:", eval_callback.best_mean_reward_overall,
          "timesteps:", eval_callback.best_mean_reward_timestep)
    model.save(save_path + MODEL_PATH + '_final_timesteps')
def main(): """ # Example with Vectorized env num_cpu = 4 # Number of processes to use my_env_kwargs={'renders': False} env = make_vec_env('panda-ip-reach-v0', n_envs=num_cpu, env_kwargs=my_env_kwargs) """ # Example with a simple Dummy vec env env = gym.envs.make('panda-ip-reach-v0', renders=False) env = DummyVecEnv([lambda: env]) #check_env(pandaenv) # The noise objects for DDPG n_actions = env.action_space.shape[-1] print("n_actions = {0}".format(n_actions)) #action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) model = DDPG(policy='MlpPolicy', env=env, learning_rate=0.001, buffer_size=1000000, learning_starts=100, batch_size=100, tau=0.005, gamma=0.99, train_freq=1, gradient_steps=-1, action_noise=action_noise, optimize_memory_usage=False, tensorboard_log="./ddpg_panda_reach_tensorboard/", create_eval_env=False, policy_kwargs=None, verbose=1, seed=None, device='auto', _init_setup_model=True) """ print("start model evaluation without learning !") mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=1) print("end model evaluation !") """ print("start model learning !") model.learn(total_timesteps=200000, log_interval=10) print("end model learning !") print("-> model saved !!") model.save("ddpg_panda_reach") """ print("start model evaluation with learning !") mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=1) print("end model evaluation !") """ """
def objective(trial):
    noise = trial.suggest_uniform('Noise', 0.1, 0.8)
    timesteps = trial.suggest_int('Timesteps', 10, 100)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(noise) * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise)
    model.learn(total_timesteps=timesteps * 1000, log_interval=1000)
    return test_model(env, model, '')
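A sketch of how this objective might be driven by Optuna; the study direction and trial count are assumptions (the original only defines the objective, and `env` / `test_model` are assumed to be in scope).

# Sketch only: assumes the score returned by test_model should be maximized
# and that 20 trials is an acceptable search budget.
import optuna

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print("best hyperparameters:", study.best_params)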
def train_ddpg():
    log_dir = f"model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    model = DDPG("CnnPolicy", env, policy_kwargs=policy_kwargs, verbose=1,
                 batch_size=2048, seed=1, learning_starts=500000)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(1000000), callback=callback, log_interval=480)
    model.save('model_save/ddpg_cnn')
def __init__(self, env, hyperparameters=DEFAULT_HYPERPARAMETERS):
    self.P = hyperparameters
    if self.P["model_class"] == "dqn":
        from stable_baselines3 import DQN
        self.model = DQN('MlpPolicy', env, verbose=self.P["verbose"])
        self.model_class = DQN
    elif self.P["model_class"] == "a2c":
        from stable_baselines3 import A2C
        from stable_baselines3.a2c import MlpPolicy
        self.model = A2C(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = A2C
    elif self.P["model_class"] == "ddpg":
        from stable_baselines3 import DDPG
        from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        self.model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=self.P["verbose"])
        self.model_class = DDPG
    elif self.P["model_class"] == "td3":
        from stable_baselines3 import TD3
        from stable_baselines3.td3.policies import MlpPolicy
        from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        self.model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=self.P["verbose"])
        self.model_class = TD3
    elif self.P["model_class"] == "ppo":
        from stable_baselines3 import PPO
        from stable_baselines3.ppo import MlpPolicy
        self.model = PPO(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = PPO
    elif self.P["model_class"] == "sac":
        from stable_baselines3 import SAC
        from stable_baselines3.sac import MlpPolicy
        self.model = SAC(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = SAC
    else:
        raise NotImplementedError()
def train_DDPG(env_train, model_name, timesteps=10000): """DDPG model""" # add the noise objects for DDPG n_actions = env_train.action_space.shape[-1] action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions)) start = time.time() model = DDPG('MlpPolicy', env_train, action_noise=action_noise) model.learn(total_timesteps=timesteps) end = time.time() model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}") print('Training time (DDPG): ', (end-start)/60,' minutes') return model
def create(self, n_envs=1):
    """Create the agent"""
    self.env = self.agent_helper.env
    log_dir = self.agent_helper.config_dir
    os.makedirs(log_dir, exist_ok=True)
    self.env = Monitor(self.env, log_dir)

    # TODO: Create the DDPG policy and define its hyperparameters here,
    # including the action space and observation space.
    policy_name = self.agent_helper.config['policy']
    self.policy = eval(policy_name)

    # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    n_actions = int(self.agent_helper.env.action_space.shape[0])
    action_noise = NormalActionNoise(
        mean=np.zeros(n_actions),
        sigma=self.agent_helper.config['rand_sigma'] * np.ones(n_actions))

    # FIXME: test:
    # self.model = DDPG("MlpPolicy", self.env, action_noise=action_noise, verbose=1,
    #                   tensorboard_log=self.agent_helper.graph_path)
    # TODO: Fix the observation space and action space later. Test whether the
    # observation space input and the output action space are correct.
    # activ_function_name = self.agent_helper.config['nn_activ']
    # activ_function = eval(activ_function_name)
    # policy_kwargs = dict(activation_fn=activ_function,
    #                      net_arch=[dict(pi=[32, 32], qf=[32, 32])])

    logger.info("Create the DDPG model")
    policy_kwargs = dict(net_arch=self.agent_helper.config['layers'])
    self.model = DDPG(
        self.policy,
        self.env,
        learning_rate=self.agent_helper.config['learning_rate'],
        buffer_size=self.agent_helper.config['buffer_size'],
        batch_size=self.agent_helper.config['batch_size'],
        tau=self.agent_helper.config['tau'],
        gamma=self.agent_helper.config['gamma'],
        gradient_steps=self.agent_helper.config['gradient_steps'],
        action_noise=action_noise,
        optimize_memory_usage=self.agent_helper.config['optimize_memory_usage'],
        create_eval_env=self.agent_helper.config['create_eval_env'],
        policy_kwargs=policy_kwargs,
        verbose=self.agent_helper.config['verbose'],
        learning_starts=self.agent_helper.config['learning_starts'],
        tensorboard_log=self.agent_helper.graph_path,
        seed=self.agent_helper.seed)
def main():
    # Create log dir
    log_dir = './ddpg_data'
    os.makedirs(log_dir, exist_ok=True)
    vix_env = trading_vix_env.trading_vix_env()
    env = Monitor(vix_env, log_dir)

    # Create action noise because TD3 and DDPG use a deterministic policy
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

    # Create the callback: check every 20000 steps
    callback = custom_call_back.CustomCallback(check_freq=20000, log_dir=log_dir)

    # Create RL model
    model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=2, batch_size=10000)

    # Train the agent
    model.learn(total_timesteps=int(5e9), callback=callback)
def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model"""
    # the noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))

    start = time.time()
    # stable-baselines3's DDPG has no param_noise keyword, so only action noise is used
    model = DDPG('MlpPolicy', env_train, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
def train():
    log_dir = f"model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                 seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(PARAM['total_time_step']), callback=callback, log_interval=480)
    model.save('model_save/' + MODEL_PATH)
def train_DDPG(env):
    print(f"action space shape -1: {env.action_space.shape[-1]}")
    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.02 * np.ones(n_actions))
    model = DDPG(
        'MlpPolicy',
        env,
        learning_rate=0.0003,
        learning_starts=5,
        train_freq=10,
        n_episodes_rollout=-1,  # only accepted by older SB3 releases; removed in later versions
        buffer_size=100000,
        action_noise=action_noise,
        batch_size=128,
        verbose=2,
    )
    model.learn(total_timesteps=1000000, log_interval=1)
    model.save("DDPG_pkl")
                seed=args.seed,
                tensorboard_log=args.tensorboard)

#--------------------------------------------------------#
#                          DDPG                          #
#--------------------------------------------------------#
elif args.algorithm == 'DDPG':
    if args.sigma:
        # noise objects for DDPG (note: action_noise is only defined when args.sigma is set)
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    model = DDPG("MlpPolicy", env,
                 action_noise=action_noise,
                 verbose=1,
                 seed=args.seed,
                 tensorboard_log=args.tensorboard)

#--------------------------------------------------------#
#                          A2C                           #
#--------------------------------------------------------#
elif args.algorithm == 'A2C':
    model = A2C('MlpPolicy', env,
                verbose=1,
                learning_rate=args.learning_rate,
                n_steps=args.n_steps,
                gamma=args.gamma,
                gae_lambda=args.gae_lambda,
                ent_coef=args.ent_coef,
                vf_coef=args.vf_coef,
    return True


if __name__ == '__main__':
    # Instantiate Environment
    env_id = 'gym_spm:spm-v0'
    env = gym.make('gym_spm:spm-v0')

    # HyperParameters
    lr = 3e-4

    # Instantiate Model
    n_actions = env.action_space.shape[-1]
    # Note: -30 * np.zeros(...) still evaluates to an all-zero mean.
    action_noise = NormalActionNoise(mean=-30 * np.zeros(n_actions), sigma=.75 * np.ones(n_actions))

    model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1)
    # model = PPO('MlpPolicy', env, tensorboard_log=log_dir)

    # Train OR Load Model
    model.learn(total_timesteps=25000)
    # model.save(model_dir_description)

    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print("Mean Reward = ", mean_reward)

    epsi_sp_list = []
    action_list = []
                    verbose=1)
    if ARGS.algo == 'td3':
        model = TD3(td3ddpgMlpPolicy,
                    train_env,
                    policy_kwargs=offpolicy_kwargs,
                    tensorboard_log=filename + '/tb/',
                    verbose=1) if ARGS.obs == ObservationType.KIN else TD3(
                        td3ddpgCnnPolicy,
                        train_env,
                        policy_kwargs=offpolicy_kwargs,
                        tensorboard_log=filename + '/tb/',
                        verbose=1)
    if ARGS.algo == 'ddpg':
        model = DDPG(td3ddpgMlpPolicy,
                     train_env,
                     policy_kwargs=offpolicy_kwargs,
                     tensorboard_log=filename + '/tb/',
                     verbose=1) if ARGS.obs == ObservationType.KIN else DDPG(
                         td3ddpgCnnPolicy,
                         train_env,
                         policy_kwargs=offpolicy_kwargs,
                         tensorboard_log=filename + '/tb/',
                         verbose=1)

    #### Create evaluation environment #########################
    if ARGS.obs == ObservationType.KIN:
        eval_env = gym.make(env_name,
                            aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS,
                            obs=ARGS.obs,
                            act=ARGS.act)
log_dir = "./Logs/DDPG/" model_dir = "./Models/DDPG/" details = f"Model_v{train_version}_" + description log_dir_description = log_dir + details model_dir_description = model_dir + details # Instantiate Model n_actions = env.action_space.shape[-1] action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=25.67 * np.ones(n_actions)) model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log=log_dir) # Train OR Load Model if train_model: model.learn(total_timesteps=25000, tb_log_name=details) model.save(model_dir_description) else: model.load(model_dir_description) mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10) print("Mean Reward = ", mean_reward)
                             deterministic=True,
                             render=False)

### DDPG Noise
### Try increasing the noise when retraining.
### Try less noise based on the policy plot.
n_actions = env.action_space.shape[-1]
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=1 * np.ones(n_actions))
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = DDPG(
    'MlpPolicy',
    env,
    action_noise=action_noise,
    verbose=1,
    tensorboard_log="./h={}/".format(horizons[rank]),
    gamma=0.99,
    learning_rate=0.0003,
)
# model = DDPG.load("Model_DDPG_FS_30.zip")
# model.learning_rate = 0.0003
# model.gamma = 0.99
# action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.05 * np.ones(n_actions))
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.075 * np.ones(n_actions))
# model.action_noise = action_noise

trainer = Trainer(env)
trainer.retrain_rl(model, episodes=20000, path="./h={}/".format(horizons[rank]))
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
leaderboard("A2C", ENV, mean_reward, std_reward, url)
print("algo:", "A2C", "env:", ENV, "mean reward:", mean_reward, "std:", std_reward)

## simulate and plot results for reference
df = env.simulate(model, reps=10)
env.plot(df, "results/a2c.png")
# policy = env.policyfn(model, reps=10)
# env.plot(policy, "results/a2c-policy.png")


## DDPG ######################################################################

# FIXME load best tuned parameters first...
model = DDPG('MlpPolicy', env, verbose=0, tensorboard_log=tensorboard_log)
model.learn(total_timesteps=300000)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
leaderboard("DDPG", ENV, mean_reward, std_reward, url)
print("algo:", "DDPG", "env:", ENV, "mean reward:", mean_reward, "std:", std_reward)

## simulate and plot results for reference
df = env.simulate(model, reps=10)
env.plot(df, "results/ddpg.png")
# policy = env.policyfn(model, reps=10)
# env.plot(policy, "results/ddpg-policy.png")


## SAC #######################################################################

# FIXME read from YAML
import os
import pybullet_envs
import kukakr5Arc

env = gym.make('kukakr5Arc-v1')

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))

# model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
model = DDPG(MlpPolicy, env, verbose=1, action_noise=action_noise)
model.learn(total_timesteps=400000)
model.save("/home/nightmareforev/git/bullet_stuff/multi_kuka_sim/kukakr5Arc/envs/saved_policies/kukakr5Arc_reacher")
print('Saving model.... Model saved')

del model  # remove to demonstrate saving and loading

model = DDPG.load(
    "/home/nightmareforev/git/bullet_stuff/multi_kuka_sim/kukakr5Arc/envs/saved_policies/kukakr5Arc_reacher",
    env=env)
print('Loading model.....Model loaded')

# env.render() goes before env.reset() for the render to work
# env.render()
if hyper["noise_type"] == "normal": hyper["action_noise"] = NormalActionNoise(mean=np.zeros(n_actions), sigma=hyper['noise_std'] * np.ones(n_actions)) elif noise_type == "ornstein-uhlenbeck": hyper["action_noise"] = OrnsteinUhlenbeckActionNoise( mean=np.zeros(n_actions), sigma=hyper['noise_std'] * np.ones(n_actions)) model = DDPG('MlpPolicy', env, verbose=0, tensorboard_log=tensorboard_log, seed=seed, gamma=hyper['gamma'], learning_rate=hyper['lr'], batch_size=hyper['batch_size'], buffer_size=hyper['buffer_size'], action_noise=hyper['action_noise'], train_freq=hyper['train_freq'], gradient_steps=hyper['train_freq'], n_episodes_rollout=hyper['n_episodes_rollout'], policy_kwargs=policy_kwargs) model = DDPG('MlpPolicy', env, verbose=0, tensorboard_log=tensorboard_log) model.learn(total_timesteps=300000) mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100) # Rescale score against optimum solution in this environment opt = escapement(env) opt_reward, std_reward = evaluate_policy(opt, env, n_eval_episodes=100) mean_reward = mean_reward / opt_reward std_reward = std_reward / opt_reward leaderboard("DDPG", ENV, mean_reward, std_reward, url)
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make("IntelligentPantry-v1")
# env = gym.make("Reacher-v2")
observation = env.reset()
print(env.action_space)

a = 0.45
b = 0.45
f = 1200

log_path = os.path.join('training', 'Logs')
# env = DummyVecEnv([lambda: env])

model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)
model3 = TD3("MlpPolicy", env, verbose=1, tensorboard_log=log_path)
model2 = DDPG('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

model3.learn(total_timesteps=500000, log_interval=100)
eval = evaluate_policy(model3, env, n_eval_episodes=20, render=True)

# episodes = 5
# for episode in range(1, episodes+1):
#     state = env.reset()
#     done = False
#     score = 0
#
#     while not done:
#         env.render()
#         action = env.action_space.sample()
#         n_state, reward, done, info = env.step(action)
#         score += reward
#     print("Episode:{} Score:{}".format(episode, score))
# env.close()
if __name__ == '__main__':
    env_id = 'gym_spm:spm-v0'
    num_cpu = 4  # Number of processes to use

    env = gym.make('gym_spm:spm-v0')
    # env = make_vec_env(env_id, n_envs=1, seed=0)
    # env = VecCheckNan(env, raise_exception=True)
    # env = check_env(env)

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=.1 * np.ones(n_actions))

    # model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./TD3_spm_v2_SOC_point5_two_state/")
    model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1,
                 tensorboard_log="./DDPG_spm_v2_SOC_point5_two_state/")

    model.learn(total_timesteps=25000, tb_log_name='DDPG_test_run_3_SOCpoint5_two_state')
    # model.save('DDPG_test_3_SOC_point5_two_states')
    # model.load('DDPG_test_2_SOC_point5_two_states')

    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print("Mean Reward = ", mean_reward)

    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []
env = SPMenv()

# HyperParameters
lr = 3e-4

model_name = "DDGP_2.pt"
model_path = "./Model/" + model_name

# Instantiate Model
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=.75 * np.ones(n_actions))

model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1,
             train_freq=25000, n_episodes_rollout=-1)
# model = DDPG(MlpPolicy, env, verbose=1, train_freq=2500, n_episodes_rollout=-1)
# wandb.watch(model)

# Train OR Load Model
model.learn(total_timesteps=25000)

env.log_state = False
model.save(model_path)

mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make('Pendulum-v0')

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1,
             tensorboard_log="./ddpg_pendulum_tensorboard/")

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=10)
print("end model learning !")

print("-> model saved !!")
model.save("ddpg_pendulum")
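A possible follow-up sketch for the Pendulum snippet above: reload the saved agent and compare the post-training score against the pre-training one (the reload step itself is an assumption, not part of the original).

# Sketch only: continues the snippet above, reusing env and mean_reward_before.
loaded_model = DDPG.load("ddpg_pendulum", env=env)
mean_reward_after, std_reward_after = evaluate_policy(loaded_model, env, n_eval_episodes=100)
print("mean reward before/after training:", mean_reward_before, mean_reward_after)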
check_env(env, warn=True, skip_render_check=True)

####
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(env.N_ACTIONS),
                                            sigma=0.1 * np.ones(env.N_ACTIONS),
                                            dt=0.005)

#### Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

#### Train the model ###############################################################################
model = DDPG(CustomPolicy, env, verbose=1, batch_size=64, action_noise=action_noise)
for i in range(step_iters):  # run for step_iters * training_timesteps
    model.learn(total_timesteps=training_timesteps)
    model.save("./models/ddpg" + str((i + 1) * training_timesteps))
    model.save_replay_buffer("./experiences/ddpg_experience" + str((i + 1) * training_timesteps))

#### Show (and record a video of) the model's performance ##########################################
env_test = RLTetherAviary(gui=False, record=True)
obs = env_test.reset()
start = time.time()
def run_model_stablebaseline(flow_params,
                             num_cpus=1,
                             rollout_size=50,
                             num_steps=50,
                             algorithm="ppo",
                             exp_config=None):
    """Run the model for num_steps if provided.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters
    num_cpus : int
        number of CPUs used during training
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps
    The total rollout length is rollout_size.

    Returns
    -------
    stable_baselines.*
        the trained model
    """
    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    if algorithm == "PPO":
        from stable_baselines3 import PPO
        train_model = PPO('MlpPolicy', env, verbose=1, n_steps=rollout_size)
        train_model.learn(total_timesteps=num_steps)
        print("Learning Process is Done.")
        return train_model
    elif algorithm == "DDPG":
        from stable_baselines3 import DDPG
        from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
        import numpy as np

        if exp_config == 'singleagent_figure_eight':
            train_model = DDPG(
                'MlpPolicy',
                env,
                verbose=1,
                n_episodes_rollout=rollout_size,  # only accepted by older SB3 releases
                learning_starts=3000,
                learning_rate=0.0001,
                action_noise=OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(1),
                    sigma=0.15 * np.ones(1),
                    initial_noise=0.7 * np.ones(1)),
                tau=0.005,
                batch_size=128,
                tensorboard_log='tensorboard_ddpg',
                device='cuda',
            )
        else:
            train_model = DDPG(
                'MlpPolicy',
                env,
                verbose=1,
                n_episodes_rollout=rollout_size,
                learning_starts=1200,
                tensorboard_log='tensorboard_ddpg',
                learning_rate=0.0001,
                action_noise=OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(1),
                    sigma=0.15 * np.ones(1),
                    initial_noise=0.7 * np.ones(1)),
                tau=0.005,
                batch_size=512,
                device='cpu',
            )
        from tensorboard_baselines.callbacks_ddpg import TensorboardCallback
        train_model.learn(
            total_timesteps=num_steps,
            log_interval=2,
            eval_log_path='ddpg_log',
            eval_freq=10,
            # callback=[TensorboardCallback],
        )
        print("Learning Process is Done.")
        return train_model
# A2C algorithm
for i in range(n_tests):
    test_name = 'saved_models/a2c_soccer_actions_env_1_' + str(i)
    n_actions = env.action_space.shape[-1]
    model = A2C('MlpPolicy', env)
    model.learn(total_timesteps=25000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

# DDPG algorithm
for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_1_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.3 * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_2_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.3 * np.ones(n_actions))
    policy_kwargs = dict(net_arch=[400, 300])
    model = DDPG('MlpPolicy', env, action_noise=action_noise, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

for i in range(n_tests):
import gym
import panda_gym
from stable_baselines3 import DDPG, HerReplayBuffer

env = gym.make("PandaPush-v2")

model = DDPG(policy="MultiInputPolicy", env=env, replay_buffer_class=HerReplayBuffer, verbose=1)
model.learn(total_timesteps=100000)
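A hedged follow-up sketch for the HER snippet above (the save name is an assumption): because HerReplayBuffer needs access to the environment, SB3 expects the env to be passed back in when the agent is reloaded.

# Sketch only: persist and reload the HER-based agent; "ddpg_her_panda_push" is a hypothetical name.
model.save("ddpg_her_panda_push")
loaded = DDPG.load("ddpg_her_panda_push", env=env)  # env required for HER-based models
obs = env.reset()
action, _ = loaded.predict(obs, deterministic=True)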