def test_ddpg_eval_env():
    """
    Additional test to check that everything is working when passing an eval env.
    """
    eval_env = gym.make("Pendulum-v0")
    model = DDPG("MlpPolicy", "Pendulum-v0", nb_rollout_steps=5,
                 nb_train_steps=2, nb_eval_steps=10,
                 eval_env=eval_env, verbose=0)
    model.learn(1000)

def test_ddpg_popart():
    """
    Test DDPG with pop-art normalization.
    """
    n_actions = 1
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000,
                 normalize_observations=True, normalize_returns=True,
                 nb_rollout_steps=128, nb_train_steps=1,
                 batch_size=64, action_noise=action_noise,
                 enable_popart=True)
    model.learn(1000)

def train_agent(train, pickle_file, agent_type, env_kwargs, parms):
    bin_path = "bin/" + pickle_file

    if path.exists(bin_path):
        # Resume from a previously saved agent.
        if agent_type == "a2c":
            print("Loading A2C Agent")
            RL_model = A2C.load(
                bin_path, tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ddpg":
            print("Loading DDPG Agent")
            RL_model = DDPG.load(
                bin_path, tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ppo":
            print("Loading PPO2 Agent")
            RL_model = PPO2.load(
                bin_path, tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        else:
            raise ValueError(f"unknown agent_type: {agent_type}")
    else:
        # Train a new agent from scratch and save it.
        e_train_gym = ipenv.PortfolioAllocEnv(df=train, **env_kwargs)
        env_train, _ = e_train_gym.get_sb_env()

        agent = ipagent.IPRLAgent(env=env_train)
        model = agent.get_model(model_name=agent_type, model_kwargs=parms)

        RL_model = agent.train_model(model=model,
                                     tb_log_name=agent_type,
                                     total_timesteps=1000000)
        RL_model.save(bin_path)

    return RL_model

def test_ddpg_normalization():
    """
    Test that observations and returns normalizations are properly saved and loaded.
    """
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=0.05)
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000,
                 normalize_observations=True, normalize_returns=True,
                 nb_rollout_steps=128, nb_train_steps=1,
                 batch_size=64, param_noise=param_noise)
    model.learn(1000)

    obs_rms_params = model.sess.run(model.obs_rms_params)
    ret_rms_params = model.sess.run(model.ret_rms_params)
    model.save('./test_ddpg')

    loaded_model = DDPG.load("test_ddpg")
    obs_rms_params_2 = loaded_model.sess.run(loaded_model.obs_rms_params)
    ret_rms_params_2 = loaded_model.sess.run(loaded_model.ret_rms_params)

    for param, param_loaded in zip(obs_rms_params + ret_rms_params,
                                   obs_rms_params_2 + ret_rms_params_2):
        assert np.allclose(param, param_loaded)

    del model, loaded_model

    if os.path.exists("./test_ddpg"):
        os.remove("./test_ddpg")

import gym

from stable_baselines.ddpg import LnMlpPolicy
from stable_baselines.ddpg import DDPG

env = gym.make('HalfCheetah-v3')

model = DDPG(LnMlpPolicy, env, gamma=0.95, buffer_size=1000000,
             param_noise_adaption_interval=0.22, batch_size=256,
             normalize_observations=True, normalize_returns=False,
             policy_kwargs=dict(layers=[400, 300]), verbose=1)
model.learn(total_timesteps=1000000)
model.save('Cheetah_model_DDPG')

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.close()

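# A minimal usage sketch (an addition, not part of the original script): the saved agent
# can be reloaded later and evaluated in a fresh environment without retraining.
eval_env = gym.make('HalfCheetah-v3')
loaded_model = DDPG.load('Cheetah_model_DDPG', env=eval_env)
obs = eval_env.reset()
for _ in range(1000):
    action, _states = loaded_model.predict(obs)
    obs, reward, done, info = eval_env.step(action)
    if done:
        obs = eval_env.reset()
eval_env.close()
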
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    Run the training of DDPG.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'),
        multiple noise types can be used by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only track the start time on rank 0.
    start_time = 0
    if rank == 0:
        start_time = time.time()

    model = DDPG(policy=MlpPolicy, env=env, memory_policy=Memory, eval_env=eval_env,
                 param_noise=param_noise, action_noise=action_noise, memory_limit=int(1e6),
                 layer_norm=layer_norm, verbose=2, **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()

    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))

def main(env, load_path, fig_path):
    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/" + "/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + "/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = DDPG.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat

        if obs.all() == obs_initial.all():
            if counter % 10 == 0:
                # dump the current trajectory to a text file
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                file = open(filename, 'w+')
                for xyz, quat in zip(xyzs, quats):
                    for coord in xyz:
                        file.write(str(coord) + " ")
                    for quat_coord in quat:
                        file.write(str(quat_coord) + " ")
                    file.write("\n")
                file.close()

            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))

            current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
            timestep = 0
            counter += 1

        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    file = open(filename, 'w+')
    for xyz, quat in zip(xyzs, quats):
        for coord in xyz:
            file.write(str(coord) + " ")
        for quat_coord in quat:
            file.write(str(quat_coord) + " ")
        file.write("\n")
    file.close()

    # print average distances
    print("average distance of box from end goal: %f" % dist)

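# NOTE: `plot_results` is not defined in this snippet. Below is a minimal sketch of a
# compatible helper, assuming the log directory contains monitor.csv files written by
# stable-baselines' Monitor wrapper; it is an illustration, not the original helper.
import matplotlib.pyplot as plt
from stable_baselines.results_plotter import load_results, ts2xy


def plot_results(fig_path, log_path):
    # Load the monitor logs, plot episode reward against timesteps, and save the figure.
    timesteps, rewards = ts2xy(load_results(log_path), 'timesteps')
    plt.figure()
    plt.plot(timesteps, rewards)
    plt.xlabel('Timesteps')
    plt.ylabel('Episode reward')
    plt.savefig(fig_path)
    plt.close()
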
def main(env, load, save_path, load_path=None, train_timesteps=1.25e6, eval_timesteps=5e3):
    # arguments
    print("env %s; load %s; save_path %s; load_path %s; train_timesteps %s; eval_timesteps %s;"
          % (env, load, save_path, load_path, train_timesteps, eval_timesteps))
    train_timesteps = int(float(train_timesteps))
    eval_timesteps = int(float(eval_timesteps))

    # models path
    model_dir = os.getcwd() + "/models/"
    os.makedirs(model_dir, exist_ok=True)

    # logging path
    log_dir = os.getcwd() + "/log/" + save_path
    os.makedirs(log_dir, exist_ok=True)

    # absolute save path and models path
    save_path = model_dir + save_path
    if load and not load_path:
        print("no load path given, exiting...")
        sys.exit()
    elif load:
        load_path = model_dir + load_path

    # make environment, flattened environment, monitor, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # load model, or start from scratch
    if load:
        print("loading model from: " + load_path)
        model = DDPG.load(load_path, env=env)
    else:
        print("training model from scratch")
        model = DDPG(MlpPolicy, env, verbose=1)

    # evaluate current model
    mean_reward_before_train = evaluate(model, env, num_steps=eval_timesteps)

    # train model
    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0
    model.learn(total_timesteps=train_timesteps, callback=None)

    # save model
    print("saving model to: " + save_path)
    model.save(save_path)

    # evaluate post training model
    mean_reward_after_train = evaluate(model, env, num_steps=eval_timesteps)

    # results
    print("reward before training: " + str(mean_reward_before_train))
    print("reward after training: " + str(mean_reward_after_train))
    print("done")

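# NOTE: `evaluate` is not defined in this snippet. Below is a minimal sketch of a
# compatible helper, assuming a single-env DummyVecEnv and a mean-episode-reward metric;
# it mirrors the stable-baselines docs example rather than the original project's helper.
import numpy as np


def evaluate(model, env, num_steps=1000):
    """Run the model for `num_steps` steps and return the mean reward per episode."""
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, rewards, dones, _info = env.step(action)
        # DummyVecEnv returns length-1 arrays; accumulate the reward of the single env.
        episode_rewards[-1] += rewards[0]
        if dones[0]:
            obs = env.reset()
            episode_rewards.append(0.0)
    mean_reward = float(np.mean(episode_rewards))
    print("mean reward: %.2f over %d episodes" % (mean_reward, len(episode_rewards)))
    return mean_reward
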
def train(env_id, num_timesteps, seed, model_path=None, images=False):
    """
    Train a DDPG model on the robosuite SawyerLift task, for testing purposes
    (PPO2 and TRPO setups are left commented out below).

    :param env_id: (str) the environment id string (unused, the task is hard-coded)
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        if images:
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_object_obs=False,
                    use_camera_obs=True,  # use pixel (camera) observations
                    has_offscreen_renderer=True,  # needed to render the camera observations
                    has_renderer=False,  # no on-screen rendering
                    camera_depth=True,
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control should happen fast enough so that simulation looks smooth
                    render_visual_mesh=False,
                ),
                keys=["image", "depth"],
                images=True,
            )
        else:
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_object_obs=True,
                    use_camera_obs=False,  # do not use pixel observations
                    has_offscreen_renderer=False,  # not needed since not using pixel obs
                    has_renderer=False,  # no on-screen rendering
                    camera_depth=False,
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control should happen fast enough so that simulation looks smooth
                    render_visual_mesh=False,
                )  # , keys=["image", "depth"], images=True,
            )
        env_out.reward_range = None
        env_out.metadata = None
        env_out.spec = None
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    # env = make_env()
    if images:
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)
        set_global_seeds(seed)
        policy = CnnPolicy
        tblog = "/cvgl2/u/surajn/workspace/tb_logs/sawyerlift_all/"
    else:
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)
        set_global_seeds(seed)
        policy = MlpPolicy
        tblog = "/cvgl2/u/surajn/workspace/tb_logs/sawyerlift_all/"

    nb_actions = env.action_space.shape[-1]
    # model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99,
    #              noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, verbose=1,
    #              tensorboard_log=tblog)
    # model = TRPO(policy=policy, env=env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
    #              cg_damping=0.1, entcoeff=0.0, gamma=0.99, lam=0.98, vf_iters=5,
    #              vf_stepsize=1e-3, tensorboard_log=tblog, verbose=1)
    model = DDPG(policy=ddpgMlpPolicy, env=env, memory_policy=Memory, eval_env=None,
                 param_noise=AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2),
                 action_noise=OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                           sigma=float(0.2) * np.ones(nb_actions)),
                 memory_limit=int(1e6), verbose=2, tensorboard_log=tblog)
    model.learn(total_timesteps=num_timesteps)
    env.close()

    if model_path:
        model.save(model_path)
        # tf_util.save_state(model_path)

    return model, env

def run(env_id, seed, layer_norm, evaluation, agent, delay_step, gamma=0.99, **kwargs):
    # Create envs.
    env = create_env(env_id, delay_step, str(0))
    print(env.observation_space, env.action_space)

    if evaluation:
        eval_env = create_env(env_id, delay_step, "eval_env")
    else:
        eval_env = None

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    start_time = time.time()

    policy = 'MlpPolicy'
    td3_variants = {
        "TD3": TD3,
        "TD3SIL": TD3SIL,
        "TD3NSTEP": TD3NSTEP,
        "TD3REDQ": TD3REDQ,
        "TD3DoubleTwin": TD3DoubleTwin,
    }

    if agent in td3_variants:
        model_func = td3_variants[agent]
        model = model_func(policy=policy, env=env, eval_env=eval_env, gamma=gamma,
                           batch_size=128, tau=0.005, policy_delay=2, learning_starts=25000,
                           action_noise=create_action_noise(env, "normal_0.1"),
                           buffer_size=100000, verbose=2, n_cpu_tf_sess=10,
                           policy_kwargs={"layers": [400, 300]})
    elif agent == "DDPG":
        model = DDPG(policy=policy, env=env, eval_env=eval_env, gamma=gamma,
                     nb_eval_steps=5, batch_size=100, nb_train_steps=100, nb_rollout_steps=100,
                     learning_starts=10000, actor_lr=1e-3, critic_lr=1e-3, critic_l2_reg=0,
                     tau=0.005, normalize_observations=False,
                     action_noise=create_action_noise(env, "normal_0.1"),
                     buffer_size=int(1e6), verbose=2, n_cpu_tf_sess=10,
                     policy_kwargs={"layers": [400, 300]})
    elif agent == "SAC":
        model = SAC(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=256,
                    action_noise=create_action_noise(env, "normal_0.1"),
                    buffer_size=int(1e6), verbose=2, n_cpu_tf_sess=10,
                    learning_starts=10000, policy_kwargs={"layers": [256, 256]})
    elif agent == "GEM":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemGEM(policy=policy, env=env, eval_env=eval_env, gamma=gamma,
                          batch_size=128, tau=0.005, policy_delay=2, learning_starts=25000,
                          action_noise=create_action_noise(env, "normal_0.1"),
                          buffer_size=100000, verbose=2, n_cpu_tf_sess=10,
                          alpha=0.5, beta=-1, iterative_q=-1, num_q=4, gradient_steps=200,
                          max_step=kwargs['max_steps'], reward_scale=1., nb_eval_steps=10,
                          policy_kwargs={"layers": [400, 300]})
    elif agent == "BP":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemBackProp(policy=policy, env=env, eval_env=eval_env, gamma=gamma,
                               batch_size=128, tau=0.005, policy_delay=2, learning_starts=25000,
                               action_noise=create_action_noise(env, "normal_0.1"),
                               buffer_size=100000, verbose=2, n_cpu_tf_sess=10,
                               alpha=0.5, beta=-1, gradient_steps=200,
                               max_step=kwargs['max_steps'], reward_scale=1., nb_eval_steps=10,
                               policy_kwargs={"layers": [400, 300]})
    else:
        raise NotImplementedError

    print("model building finished")

    model.learn(total_timesteps=kwargs['num_timesteps'])
    env.close()
    if eval_env is not None:
        eval_env.close()
    logger.info('total runtime: {}s'.format(time.time() - start_time))
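
# NOTE: `create_env` and `create_action_noise` are project-specific helpers that are not
# shown here. Below is a minimal sketch of `create_action_noise`, assuming the same
# "<type>_<stddev>" string convention (e.g. "normal_0.1", "ou_0.2", "none") that the
# earlier DDPG run() function parses inline; the import path may be
# stable_baselines.ddpg.noise in older stable-baselines releases.
import numpy as np
from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise


def create_action_noise(env, noise_type):
    nb_actions = env.action_space.shape[-1]
    noise_type = noise_type.strip()
    if noise_type == 'none':
        return None
    name, stddev = noise_type.split('_')
    if name == 'normal':
        return NormalActionNoise(mean=np.zeros(nb_actions),
                                 sigma=float(stddev) * np.ones(nb_actions))
    if name == 'ou':
        return OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                            sigma=float(stddev) * np.ones(nb_actions))
    raise ValueError('unknown noise type "{}"'.format(noise_type))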