def train():
    best_reward, best_reward_timesteps = None, None
    save_path = "model_save/" + MODEL_PATH + "/"
    os.makedirs(save_path, exist_ok=True)
    # log_dir = f"model_save/"
    log_dir = save_path

    # Separate training and validation environments, both monitored and vectorized
    env, env_eval = ENV(util='train', par=PARAM, dt=DT), ENV(util='val', par=PARAM, dt=DT)
    env, env_eval = Monitor(env, log_dir), Monitor(env_eval, log_dir)
    env, env_eval = DummyVecEnv([lambda: env]), DummyVecEnv([lambda: env_eval])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    if PARAM['algo'] == 'td3':
        model = TD3('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                    seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    elif PARAM['algo'] == 'ddpg':
        model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                     seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    elif PARAM['algo'] == 'ppo':
        model = PPO('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'])

    # Note: `save_freq`, `best_mean_reward_overall` and `best_mean_reward_timestep` are not
    # part of the stock stable-baselines3 EvalCallback; they assume a customized callback
    # such as the sketch below.
    eval_callback = EvalCallback(env_eval, best_model_save_path=save_path + MODEL_PATH + '_best_model',
                                 log_path=log_dir, eval_freq=PARAM['eval_freq'],
                                 save_freq=PARAM['save_freq'], deterministic=True, render=False)
    model.learn(total_timesteps=int(PARAM['total_time_step']), callback=eval_callback, log_interval=500)
    print("best mean reward:", eval_callback.best_mean_reward_overall,
          "timesteps:", eval_callback.best_mean_reward_timestep)
    model.save(save_path + MODEL_PATH + '_final_timesteps')
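# A minimal sketch (an assumption, not stable-baselines3 API) of the customized EvalCallback
# the snippet above appears to rely on: the stock EvalCallback accepts no `save_freq` and
# exposes no `best_mean_reward_overall` / `best_mean_reward_timestep`, so a subclass along
# these lines would be needed. The class name is hypothetical.
import os

from stable_baselines3.common.callbacks import EvalCallback


class TrackingEvalCallback(EvalCallback):
    def __init__(self, *args, save_freq=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.save_freq = save_freq  # optional periodic checkpoint interval
        self.best_mean_reward_overall = -float("inf")
        self.best_mean_reward_timestep = 0

    def _on_step(self) -> bool:
        continue_training = super()._on_step()
        # The parent updates self.last_mean_reward whenever an evaluation runs
        if self.last_mean_reward > self.best_mean_reward_overall:
            self.best_mean_reward_overall = self.last_mean_reward
            self.best_mean_reward_timestep = self.num_timesteps
        # Optional periodic snapshot, independent of the best-model logic
        if self.save_freq is not None and self.n_calls % self.save_freq == 0:
            self.model.save(os.path.join(self.best_model_save_path or ".", "checkpoint"))
        return continue_training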
def train_DDPG(self, model_name, model_params=config.DDPG_PARAMS):
    """DDPG model"""
    from stable_baselines3 import DDPG
    from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

    env_train = self.env

    n_actions = env_train.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

    start = time.time()
    model = DDPG('MlpPolicy', env_train,
                 batch_size=model_params['batch_size'],
                 buffer_size=model_params['buffer_size'],
                 action_noise=action_noise,
                 verbose=model_params['verbose'],
                 tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}")
    model.learn(total_timesteps=model_params['timesteps'], tb_log_name="DDPG_run")
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
def main():
    """
    # Example with a vectorized env
    num_cpu = 4  # Number of processes to use
    my_env_kwargs = {'renders': False}
    env = make_vec_env('panda-ip-reach-v0', n_envs=num_cpu, env_kwargs=my_env_kwargs)
    """
    # Example with a simple dummy vec env
    env = gym.envs.make('panda-ip-reach-v0', renders=False)
    env = DummyVecEnv([lambda: env])
    # check_env(pandaenv)

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    print("n_actions = {0}".format(n_actions))
    # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

    model = DDPG(policy='MlpPolicy',
                 env=env,
                 learning_rate=0.001,
                 buffer_size=1000000,
                 learning_starts=100,
                 batch_size=100,
                 tau=0.005,
                 gamma=0.99,
                 train_freq=1,
                 gradient_steps=-1,
                 action_noise=action_noise,
                 optimize_memory_usage=False,
                 tensorboard_log="./ddpg_panda_reach_tensorboard/",
                 create_eval_env=False,  # only accepted by stable-baselines3 < 1.7
                 policy_kwargs=None,
                 verbose=1,
                 seed=None,
                 device='auto',
                 _init_setup_model=True)

    """
    print("start model evaluation without learning !")
    mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=1)
    print("end model evaluation !")
    """

    print("start model learning !")
    model.learn(total_timesteps=200000, log_interval=10)
    print("end model learning !")

    print("-> model saved !!")
    model.save("ddpg_panda_reach")

    """
    print("start model evaluation with learning !")
    mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=1)
    print("end model evaluation !")
    """
def train_ddpg():
    log_dir = "model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    # `policy_kwargs` is expected to be defined at module level;
    # a sketch of SaveOnBestTrainingRewardCallback follows this snippet
    model = DDPG("CnnPolicy", env, policy_kwargs=policy_kwargs, verbose=1,
                 batch_size=2048, seed=1, learning_starts=500000)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(1000000), callback=callback, log_interval=480)
    model.save('model_save/ddpg_cnn')
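# Several of these snippets use a SaveOnBestTrainingRewardCallback that is not defined here.
# A minimal sketch, following the monitoring example from the stable-baselines3 docs and
# assuming the environment is wrapped in a Monitor writing to `log_dir`:
import os

import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy


class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, "best_model")
        self.best_mean_reward = -np.inf

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            # Mean reward over the last 100 monitored episodes
            x, y = ts2xy(load_results(self.log_dir), "timesteps")
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    if self.verbose > 0:
                        print(f"Saving new best model to {self.save_path}")
                    self.model.save(self.save_path)
        return True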
def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model"""
    # add the noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    start = time.time()
    model = DDPG('MlpPolicy', env_train, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model"""
    # the noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    start = time.time()
    param_noise = None  # the `param_noise` keyword was dropped: stable-baselines3's DDPG does not accept it
    model = DDPG('MlpPolicy', env_train, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
def train():
    log_dir = "model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                 seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(PARAM['total_time_step']), callback=callback, log_interval=480)
    model.save('model_save/' + MODEL_PATH)
def train_DDPG(env):
    print(f"action space shape -1: {env.action_space.shape[-1]}")

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.02 * np.ones(n_actions))

    model = DDPG(
        'MlpPolicy',
        env,
        learning_rate=0.0003,
        learning_starts=5,
        train_freq=10,
        # `n_episodes_rollout=-1` was dropped: the argument no longer exists in
        # stable-baselines3; episodic collection is expressed via `train_freq`,
        # e.g. train_freq=(1, "episode")
        buffer_size=100000,
        action_noise=action_noise,
        batch_size=128,
        verbose=2,
    )
    model.learn(total_timesteps=1000000, log_interval=1)
    model.save("DDPG_pkl")
for i in range(n_tests):
    test_name = 'saved_models/a2c_soccer_actions_env_1_' + str(i)
    n_actions = env.action_space.shape[-1]
    model = A2C('MlpPolicy', env)
    model.learn(total_timesteps=25000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

# DDPG algorithm
for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_1_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.3) * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_2_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.3) * np.ones(n_actions))
    policy_kwargs = dict(net_arch=[400, 300])
    model = DDPG('MlpPolicy', env, action_noise=action_noise, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_3_' + str(i)
    n_actions = env.action_space.shape[-1]
#### Create the callback: check every 1000 steps ###################################################
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

#### Train the model ###############################################################################
model = DDPG(CustomPolicy, env, verbose=1, batch_size=64, action_noise=action_noise)
for i in range(step_iters):  # run for step_iters * training_timesteps
    model.learn(total_timesteps=training_timesteps)
    model.save("./models/ddpg" + str((i + 1) * training_timesteps))
    model.save_replay_buffer("./experiences/ddpg_experience" + str((i + 1) * training_timesteps))

#### Show (and record a video of) the model's performance ##########################################
env_test = RLTetherAviary(gui=False, record=True)
obs = env_test.reset()
start = time.time()
for i in range(10 * env_test.SIM_FREQ):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env_test.step(action)
    if done:
        break
env_test.close()
env.close()
# HyperParameters
lr = 3e-4
model_name = "DDGP_2.pt"
model_path = "./Model/" + model_name

# Instantiate Model
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=.75 * np.ones(n_actions))
model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1)

# Train OR Load Model
model.learn(total_timesteps=25000)
model.save(model_path)

mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print("Mean Reward = ", mean_reward)
print(env.soc_list)

epsi_sp_list = []
action_list = []
soc_list = []
Concentration_list = []
Concentration_list1 = []
# The head of the model construction is missing from this snippet; the line below is an
# assumed reconstruction that keeps only the arguments that were actually present.
model = DDPG('MlpPolicy', env,
             action_noise=action_noise,
             verbose=1,
             tensorboard_log="./ddpg_pendulum_tensorboard/")

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=10)
print("end model learning !")

print("-> model saved !!")
model.save("ddpg_pendulum")

print("start model evaluation with learning !")
mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("-> model evaluation without learning")
print(
    f"mean_reward:{mean_reward_before:.2f} +/- std_reward:{std_reward_before:.2f}"
)
print("-> model evaluation with learning")
print(
    f"mean_reward:{mean_reward_after:.2f} +/- std_reward:{std_reward_after:.2f}"
)
import kukakr5Arc

env = gym.make('kukakr5Arc-v1')

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

# model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
model = DDPG(MlpPolicy, env, verbose=1, action_noise=action_noise)
model.learn(total_timesteps=400000)
model.save(
    "/home/nightmareforev/git/bullet_stuff/multi_kuka_sim/kukakr5Arc/envs/saved_policies/kukakr5Arc_reacher"
)
print('Saving model.... Model saved')

del model  # remove to demonstrate saving and loading

model = DDPG.load(
    "/home/nightmareforev/git/bullet_stuff/multi_kuka_sim/kukakr5Arc/envs/saved_policies/kukakr5Arc_reacher",
    env=env)
print('Loading model.....Model loaded')

# env.render() goes before env.reset() for the render to work
# env.render()
# obs = env.reset()
# while True:
env_id = 'gym_spm:spm-v0'
num_cpu = 4  # Number of processes to use

env = gym.make('gym_spm:spm-v0')
# env = make_vec_env(env_id, n_envs=1, seed=0)
# env = VecCheckNan(env, raise_exception=True)
# env = check_env(env)

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=25.67 * np.ones(n_actions))

# model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./TD3_spm_v2_SOC_point5_two_state/")
model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./DDPG_spm_v2_SOC_point5_two_state/")

model.learn(total_timesteps=25000, tb_log_name='DDPG_test_run_3_SOCpoint5_two_state')
model.save('DDPG_test_3_SOC_point5_two_states')
# `load` is a classmethod that returns a new model, so reassign instead of calling it on the instance
model = DDPG.load('DDPG_test_2_SOC_point5_two_states', env=env)

mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print("Mean Reward = ", mean_reward)

epsi_sp_list = []
action_list = []
soc_list = []
Concentration_list = []
Concentration_list1 = []

obs = env.reset()
for _ in range(3600):
model_dir_description = model_dir + details

# Instantiate Model
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=25.67 * np.ones(n_actions))
model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log=log_dir)

# Train OR Load Model
if train_model:
    model.learn(total_timesteps=25000, tb_log_name=details)
    model.save(model_dir_description)
else:
    # `load` is a classmethod that returns a new model, so reassign instead of calling it on the instance
    model = DDPG.load(model_dir_description, env=env)

mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print("Mean Reward = ", mean_reward)

epsi_sp_list = []
action_list = []
soc_list = []
Concentration_list = []
Concentration_list1 = []
env_id = 'gym_spm:spm-v0'
num_cpu = 4  # Number of processes to use
train = True

env = gym.make('gym_spm:spm-v0')

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=25.67 * np.ones(n_actions))

model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./DDPG_spm_v2_SOC_point5_two_state/")
# model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./TD3_spm_v2_SOC_point5_two_state/")

if train:
    model.learn(total_timesteps=2500000, tb_log_name='test_run_3_SOCpoint5_two_state')
    model.save('TD3_test_3_SOC_point5_two_states')
else:
    # `load` is a classmethod that returns a new model, so reassign instead of calling it on the instance
    model = DDPG.load('TD3_test_2_SOC_point5_two_states', env=env)

mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print("Mean Reward = ", mean_reward)

epsi_sp_list = []
action_list = []
soc_list = []
Concentration_list = []
Concentration_list1 = []

obs = env.reset()
for _ in range(3600):