# NOTE: project-local helpers (get_paths, get_env, get_alg, get_policy,
# create_training_callback) are assumed to be defined or imported elsewhere in
# this module; the library imports below are the ones this function needs.
import os

import numpy as np

from stable_baselines import logger
from stable_baselines.bench import Monitor
from stable_baselines.common import set_global_seeds
from stable_baselines.common.misc_util import mpi_rank_or_zero
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize


def train(params, model=None, path=None):
    if model:
        # Indicate in the filename that this is a finetune.
        if params['name']:
            params['name'] += '_Finetune'
        else:
            params['name'] = 'Finetune'

    data_dir, tb_path = get_paths(params, path=path)
    print("Training Parameters: ", params)
    os.makedirs(data_dir, exist_ok=True)
    # Save parameters immediately.
    params.save(data_dir)

    # Only the MPI rank-0 worker should log.
    rank = mpi_rank_or_zero()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    def make_env(i):
        env = get_env(params)
        env = Monitor(env, data_dir + '/' + str(i),
                      allow_early_resets=params['early_reset'])
        return env

    use_her = params['env_args']['use_her'] if 'use_her' in params['env_args'] else False
    if use_her:
        # HER operates on a single, non-vectorized environment.
        env = make_env(0)
        goal_selection_strategy = 'future'
    else:
        # The double lambda binds i by value, avoiding late-binding closure bugs.
        env = DummyVecEnv([(lambda n: lambda: make_env(n))(i)
                           for i in range(params['num_proc'])])

    if model:
        # Sanity-check that the finetuned model and the new env agree on action spaces.
        print("Model action space", model.action_space, model.action_space.low)
        print("Env action space", env.action_space, env.action_space.low)

    if params['normalize']:
        env = VecNormalize(env)

    if params['seed']:
        # Offset the seed per MPI rank so workers do not share random streams.
        seed = params['seed'] + 100000 * rank
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed

    if 'noise' in params and params['noise']:
        from stable_baselines.ddpg import OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        params['alg_args']['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=float(params['noise']) * np.ones(n_actions))

    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        if use_her:
            from stable_baselines import HER
            model = HER(policy, env, alg, n_sampled_goal=4,
                        goal_selection_strategy=goal_selection_strategy,
                        verbose=1, tensorboard_log=tb_path,
                        policy_kwargs=params['policy_args'], **params['alg_args'])
        else:
            model = alg(policy, env, verbose=1, tensorboard_log=tb_path,
                        policy_kwargs=params['policy_args'], **params['alg_args'])
    else:
        model.set_env(env)

    model.learn(total_timesteps=params['timesteps'],
                log_interval=params['log_interval'],
                callback=create_training_callback(data_dir,
                                                  freq=params['eval_freq'],
                                                  checkpoint_freq=params['checkpoint_freq']))

    print("######## SAVING MODEL TO", data_dir)
    model.save(data_dir + '/final_model')
    if params['normalize']:
        env.save(data_dir + '/normalized_environment.env')
    env.close()
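# ---------------------------------------------------------------------------
# create_training_callback() is called above but not defined in this file.
# The sketch below is an assumed, minimal implementation reconstructed from
# its call sites (data_dir, freq, checkpoint_freq, low_level_data_dir); it
# follows the legacy stable-baselines callback signature (_locals, _globals)
# and only does periodic checkpointing. It is illustrative, not the project's
# actual callback.
# ---------------------------------------------------------------------------
def create_training_callback(data_dir, low_level_data_dir=None, freq=1000,
                             checkpoint_freq=10000):
    state = {'n_calls': 0}  # mutable counter shared across callback invocations

    def callback(_locals, _globals):
        state['n_calls'] += 1
        if state['n_calls'] % checkpoint_freq == 0:
            # _locals['self'] is the model being trained (legacy callback API).
            _locals['self'].save(os.path.join(data_dir,
                                              'checkpoint_%d' % state['n_calls']))
        # In the real project, `freq` would drive periodic evaluation and
        # `low_level_data_dir` a second save path; both are omitted here.
        return True  # returning False would stop training early

    return callback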
# NOTE: the start of this script was missing. The callback head below is
# reconstructed following the standard legacy stable-baselines pattern
# (load_results + ts2xy over the Monitor logs); everything from the first
# print() onward is the original fragment.
def callback(_locals, _globals):
    global n_steps, best_mean_reward
    x, y = ts2xy(load_results(log_dir), 'timesteps')
    if len(x) > 0:
        mean_reward = np.mean(y[-100:])
        print(x[-1], 'timesteps')
        print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
            best_mean_reward, mean_reward))
        # New best model, you could save the agent here
        if mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            # Example for saving best model
            print("Saving new best model")
            _locals['self'].save(log_dir + 'best_model.pkl')
            env.envs[0].save(os.path.join(log_dir, "vec_normalize.pkl"))
    n_steps += 1
    # Returning False will stop training early
    return True


# env_s = lambda: gym.make("HopperEnvRep-v0")
# env_s = Monitor(env_s, log_dir, allow_early_resets=True)
# env.act_rep = 20
model = SAC(MlpPolicy, env, verbose=1)
# model = PPO2(MlpPolicy, env, verbose=True)
# use_action_repeat is not a stock stable-baselines argument; it appears to
# come from a modified SAC.learn() in this project.
model.learn(total_timesteps=1000, use_action_repeat=True, callback=callback)
f.close()  # 'f' is a log-file handle opened in the missing part of the script

# Don't forget to save the VecNormalize statistics when saving the agent
# log_dir = "logs/hopper_aneal/"
model.save(log_dir + "sac_hopper")
env.save(os.path.join(log_dir, "vec_normalize.pkl"))
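# ---------------------------------------------------------------------------
# A sketch of how the artifacts saved above could be reloaded for evaluation.
# It assumes stable-baselines >= 2.10, where VecNormalize.load() is available,
# and the "HopperEnvRep-v0" env id from the commented-out code above; it is
# not part of the original script.
# ---------------------------------------------------------------------------
import os

import gym
from stable_baselines import SAC
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize


def evaluate_saved_model(log_dir):
    eval_env = DummyVecEnv([lambda: gym.make("HopperEnvRep-v0")])
    eval_env = VecNormalize.load(os.path.join(log_dir, "vec_normalize.pkl"), eval_env)
    eval_env.training = False     # freeze the running normalization statistics
    eval_env.norm_reward = False  # report raw rewards during evaluation
    model = SAC.load(log_dir + "sac_hopper")
    obs = eval_env.reset()
    total_reward = 0.0
    for _ in range(1000):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, _ = eval_env.step(action)
        total_reward += reward[0]
        if done[0]:
            break
    return total_reward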
def train_hrl(low_params, high_params, high_training_starts=0, model=None, path=None):
    if model:
        # Indicate in the filenames that this is a finetune.
        if low_params['name']:
            low_params['name'] += '_Finetune'
        else:
            low_params['name'] = 'Finetune'
        if high_params['name']:
            high_params['name'] += '_Finetune'
        else:
            high_params['name'] = 'Finetune'

    params = merge_hrl_params(low_params, high_params)
    data_dir, tb_path = get_paths(params, path=path)

    # Derive separate low-level and high-level directories from the merged path
    # by inserting 'Low'/'High' before the final underscore-delimited component.
    data_dir_components = data_dir.split('_')
    data_dir_components.insert(-1, 'Low')
    low_data_dir = '_'.join(data_dir_components)
    data_dir_components[-2] = 'High'
    high_data_dir = '_'.join(data_dir_components)
    os.makedirs(high_data_dir, exist_ok=True)
    os.makedirs(low_data_dir, exist_ok=True)

    # Enforce consistency across params by using the split function.
    low_params, high_params = split_hrl_params(params)
    print("HRL PARAMS")
    print("High Params", high_params)
    print("Low Params", low_params)
    high_params['env_wrapper_args']['policy'] = '/'.join(low_data_dir.split('/')[-2:])
    low_params.save(low_data_dir)
    high_params.save(high_data_dir)

    def make_env(i):
        env = get_env(params)
        print("ENVIRONMENT", env)
        env = Monitor(env, high_data_dir + '/' + str(i),
                      allow_early_resets=params['early_reset'],
                      info_keywords=('low_ep_info',))
        return env

    env = DummyVecEnv([(lambda n: lambda: make_env(n))(i)
                       for i in range(params['num_proc'])])
    if params['normalize']:
        env = VecNormalize(env)

    seed = params['seed']
    if seed:
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed

    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        # The HRL algorithm takes separate policies and prefixed kwargs for the
        # low- and high-level learners.
        model = alg(policy, policy, env, verbose=1, tensorboard_log=tb_path,
                    high_policy_kwargs=params['high_policy_args'],
                    low_policy_kwargs=params['low_policy_args'],
                    **{'low_' + key: value for key, value in params['low_alg_args'].items()},
                    **{'high_' + key: value for key, value in params['high_alg_args'].items()})
    else:
        model.set_env(env)

    model.learn(total_timesteps=params['timesteps'],
                log_interval=int(params['log_interval'] / 4),
                callback=create_training_callback(high_data_dir,
                                                  low_level_data_dir=low_data_dir,
                                                  freq=params['eval_freq'],
                                                  checkpoint_freq=params['checkpoint_freq']),
                high_training_starts=high_training_starts)

    model.save(low_data_dir + '/final_model', high_data_dir + '/final_model')
    if params['normalize']:
        # Save under the high-level directory: only the Low/High directories are
        # created above, so the original data_dir path would not exist.
        env.save(high_data_dir + '/normalized_environment.env')
    env.close()
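# ---------------------------------------------------------------------------
# A minimal, assumed invocation of train_hrl(). The Params loader and config
# file names are hypothetical (the project's params class only needs the keys
# referenced above); the call shape follows the signature of train_hrl, here
# freezing the high-level policy for the first 50000 steps.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    low_params = Params.load('configs/hrl_low.json')    # hypothetical loader
    high_params = Params.load('configs/hrl_high.json')  # hypothetical loader
    train_hrl(low_params, high_params, high_training_starts=50000)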
import os

import gym
import rospy
import matplotlib.pyplot as plt

from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.results_plotter import load_results, ts2xy


def plot_results(log_folder, title='Learning Curve'):
    # NOTE: the head of this helper was missing; x and y are recovered here with
    # the standard stable-baselines pattern (load_results + ts2xy over the
    # Monitor logs), smoothed with the moving_average helper sketched after the
    # main block below.
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = moving_average(y, window=50)  # assumed smoothing window
    x = x[len(x) - len(y):]  # truncate x so it matches the smoothed y
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Learning Curve Smoothed")
    plt.show()


if __name__ == "__main__":
    rospy.init_node('drone_gym')
    env_id = 'Crazyflie-v0'
    log_dir = 'models/hover/empty_world_small/finalVec'

    env = DummyVecEnv([lambda: gym.make(env_id)])
    # Automatically normalize the input features and reward
    env = VecNormalize(env, norm_obs=True, norm_reward=True)

    # # Save the best model every n steps and monitor performance
    # save_best_callback = SaveOnBestTrainingRewardCallback(check_freq=5, log_dir=log_dir)
    # # Save the model every n steps
    # checkpoint_callback = CheckpointCallback(save_freq=5, save_path='./' + log_dir, name_prefix='ppo2')

    # Train from scratch
    model = PPO2(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=80000)
    # model.learn(total_timesteps=20, callback=[save_best_callback, checkpoint_callback])

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(log_dir + "/ppo2_final")
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env.save(stats_path)
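# ---------------------------------------------------------------------------
# moving_average() is used by plot_results above but was not part of the
# fragment. Below is the standard helper from the stable-baselines examples;
# in practice it (and its numpy import) would sit above plot_results.
# ---------------------------------------------------------------------------
import numpy as np


def moving_average(values, window):
    """Smooth `values` with a rolling mean over `window` entries."""
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')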