def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, her_config, total_time_steps,
                 validate_every_timesteps, task_name):
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array([100000, 0, 0, 0]),
                         fractional_reward_weight=0)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num,
                      max_episode_length=maximum_episode_length)
    env = HERGoalEnvWrapper(env)
    env = CurriculumWrapper(env,
                            intervention_actors=[GoalInterventionActorPolicy()],
                            actives=[(0, 1000000000, 1, 0)])
    set_global_seeds(seed_num)
    checkpoint_callback = CheckpointCallback(
        save_freq=int(validate_every_timesteps / num_of_envs),
        save_path=log_relative_path,
        name_prefix='model')
    model = HER(MlpPolicy,
                env,
                SAC,
                verbose=1,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **her_config,
                seed=seed_num)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="her_sac",
                callback=checkpoint_callback)
    return
def main(load_policy=True):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    gamma = 0.9
    memory_limit = 1000000
    timesteps = 15000000
    discreteAction = 0
    rend = False

    env = bioEnv()
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomTD3Policy, env, model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/bioEnv_TD3",
                buffer_size=1000000,
                batch_size=256,
                random_exploration=0.3,
                action_noise=action_noise)

    if load_policy:
        model = HER.load("models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl",
                         env=env,
                         n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/bioEnv_TD3",
                         buffer_size=1000000,
                         batch_size=256,
                         random_exploration=0.3,
                         action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    model.save("policy_TD3_Discr")
def test_long_episode(model_class):
    """
    Check that the model does not break when the replay buffer is still empty
    after the first rollout (because the episode is not over).
    """
    # n_bits > nb_rollout_steps
    n_bits = 10
    env = BitFlippingEnv(n_bits,
                         continuous=model_class in [DDPG, SAC, TD3],
                         max_steps=n_bits)
    kwargs = {}
    if model_class == DDPG:
        kwargs['nb_rollout_steps'] = 9  # < n_bits
    elif model_class in [DQN, SAC, TD3]:
        kwargs['batch_size'] = 8  # < n_bits
        kwargs['learning_starts'] = 0

    model = HER('MlpPolicy', env, model_class, n_sampled_goal=4,
                goal_selection_strategy='future', verbose=0, **kwargs)
    model.learn(100)
def main(argv):
    numControlledJoints = 6
    fixed = False
    normalize_observations = False
    gamma = 0.9
    batch_size = 16
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1000000
    policy_name = "reaching_policy"
    discreteAction = 0
    rend = False

    kukaenv = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 numControlledJoints=numControlledJoints,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)
    kukaenv = Monitor(kukaenv, log_dir, allow_early_resets=True)
    n_actions = kukaenv.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))
    model_class = DDPG
    goal_selection_strategy = 'future'

    model = HER(CustomPolicy, kukaenv, model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/kuka_reach_ddpg/reaching_DDPG_HER_PHASE",
                buffer_size=1000000,
                batch_size=64,
                random_exploration=0.3,
                action_noise=action_noise)

    print(colored("-----Timesteps:", "red"))
    print(colored(timesteps, "red"))
    print(colored("-----Number Joints Controlled:", "red"))
    print(colored(numControlledJoints, "red"))
    print(colored("-----Object Position Fixed:", "red"))
    print(colored(fixed, "red"))
    print(colored("-----Policy Name:", "red"))
    print(colored(policy_name, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model.learn(total_timesteps=timesteps, log_interval=100, callback=callback)

    print("Saving model to kuka.pkl")
    model.save("../pybullet_logs/kukareach_ddpg_her/" + policy_name)
    del model  # remove to demonstrate saving and loading
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):

    def _make_env(rank):
        def _init():
            task = generate_task(task_generator_id=task_name)
            env = CausalWorld(task=task,
                              skip_frame=skip_frame,
                              enable_visualization=False,
                              seed=seed_num + rank,
                              max_episode_length=maximum_episode_length)
            env = HERGoalEnvWrapper(env)
            return env

        set_global_seeds(seed_num)
        return _init

    os.makedirs(log_relative_path)
    env = SubprocVecEnv([_make_env(rank=i) for i in range(num_of_envs)])
    model = HER('MlpPolicy',
                env,
                SAC,
                verbose=1,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **sac_config)
    save_config_file(sac_config,
                     _make_env(0)(),
                     os.path.join(log_relative_path, 'config.json'))
    for i in range(int(total_time_steps / validate_every_timesteps)):
        model.learn(total_timesteps=validate_every_timesteps,
                    tb_log_name="sac",
                    reset_num_timesteps=False)
        model.save(os.path.join(log_relative_path, 'saved_model'))
    return
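# A minimal evaluation sketch for the checkpoints written by train_policy above. It
# rebuilds a single HERGoalEnvWrapper env with the same helpers; the episode count
# is illustrative and this function is not part of the original script.
def evaluate_policy(log_relative_path, task_name, skip_frame, seed_num,
                    maximum_episode_length, n_episodes=10):
    task = generate_task(task_generator_id=task_name)
    env = HERGoalEnvWrapper(
        CausalWorld(task=task,
                    skip_frame=skip_frame,
                    enable_visualization=False,
                    seed=seed_num,
                    max_episode_length=maximum_episode_length))
    model = HER.load(os.path.join(log_relative_path, 'saved_model'), env=env)
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)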
def main(training_env: PSMCartesianHERDDPGEnv,
         eval_env: PSMCartesianHERDDPGEnv = None,
         log_dir='./.logs/results'):
    os.makedirs(log_dir, exist_ok=True)
    # training_env = Monitor(training_env, log_dir)

    n_actions = training_env.action_space.shape[0]
    noise_std = 0.2
    # Currently using OU noise
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=noise_std * np.ones(n_actions))
    model_class = DDPG  # works also with SAC and TD3
    rl_model_kwargs = {
        'actor_lr': 1e-3,
        'critic_lr': 1e-3,
        'action_noise': action_noise,
        'nb_train_steps': 300,
        'nb_rollout_steps': 100,
        'gamma': 0.95,
        'observation_range': (-1.5, 1.5),
        'random_exploration': 0.05,
        'normalize_observations': True,
        'critic_l2_reg': 0.01
    }

    # Available strategies (cf paper): future, final, episode, random
    model = HER('MlpPolicy',
                training_env,
                model_class,
                verbose=1,
                n_sampled_goal=4,
                goal_selection_strategy='future',
                buffer_size=int(1e5),
                batch_size=128,
                tensorboard_log="./ddpg_dvrk_tensorboard/",
                **rl_model_kwargs)

    # Reset the model
    training_env.reset()

    # Create callbacks
    checkpoint_callback = CheckpointCallback(
        save_freq=100000,
        save_path="./ddpg_dvrk_tensorboard/")
    # save_path="./.model/model_checkpoint/")  # save_freq=100000
    # eval_callback = EvalCallback(training_env, best_model_save_path='./ddpg_dvrk_tensorboard/best_model',
    #                              log_path=log_dir, eval_freq=500)
    callback = CallbackList([checkpoint_callback])  # , eval_callback])

    # Train the model
    model.learn(4000000, log_interval=100, callback=callback)
    model.save("./her_robot_env")
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 8000000
    rend = False
    obj_pose_rnd_std = 0

    env = pandaPushGymGoalEnv(renders=rend,
                              use_IK=0,
                              numControlledJoints=action_space,
                              obj_pose_rnd_std=obj_pose_rnd_std,
                              includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomTD3Policy,
                env,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
                buffer_size=1000000,
                batch_size=256,
                random_exploration=0.3,
                action_noise=action_noise)

    if load_policy:
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
            buffer_size=1000000,
            batch_size=256,
            random_exploration=0.3,
            action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/TD3_phase1_target_fixed")
def test_her(model_class, goal_selection_strategy, discrete_obs_space):
    env = BitFlippingEnv(N_BITS,
                         continuous=model_class in [DDPG, SAC, TD3],
                         max_steps=N_BITS,
                         discrete_obs_space=discrete_obs_space)

    # Take random actions 10% of the time
    kwargs = {'random_exploration': 0.1} if model_class in [DDPG, SAC, TD3] else {}
    model = HER('MlpPolicy', env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0, **kwargs)
    model.learn(1000)
def main():
    model_class = DDPG  # works also with SAC and DDPG
    # -j
    action_space = 7
    # -p
    fixed = True
    # -o
    normalize_observations = False
    # -g
    gamma = 0.9
    # -b
    # batch_size = 16
    # -m
    memory_limit = 1000000
    # -r
    normalize_returns = True
    # -t
    timesteps = 1000000
    policy_name = "pushing_policy"
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomPolicy,
                env,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_ddpg/stable_baselines/DDPG+HER_FIXED_DYN_RAND",
                buffer_size=1000000,
                batch_size=256,
                random_exploration=0.3,
                action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps)
    print("Saving Policy")
    model.save("../policies/pushing_fixed_HER_Dyn_Rand")
def launchAgent(model_name: str):
    """
    :param model_name: which model to run; must be HER, DDPG, PPO2, or any other value (DQN).
                       Currently intended to be set to PPO2.
    :return: the model after 1000 training cycles
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    elif model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    elif model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",              # policy
            env=env,                  # environment
            double_q=True,            # Double Q enabled
            prioritized_replay=True,  # prioritized replay buffer enabled
            verbose=0                 # log printing
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i), env)
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i), env)
            elif model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i), env)

        # print('model learn start')
        model.learn(total_timesteps=12500)  # minimum value at which the FPS stays above 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        del model
        # print('model save end')

    return model
def heralgorithm():
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    # Wrap the model
    model = HER('MlpPolicy', env1, DDPG, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1)
    # Train the model
    model.learn(1000)
    model.save("./her_bit_env")
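# A minimal sketch of reloading the policy saved above, mirroring the predict loop
# used in the other snippets; it assumes `env1` (a goal env) is still in scope,
# since HER.load needs an env (or a HERGoalEnvWrapper-wrapped one) for predict.
def herpredict(n_steps=100):
    model = HER.load("./her_bit_env", env=env1)
    obs = env1.reset()
    for _ in range(n_steps):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env1.step(action)
        if done:
            obs = env1.reset()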
def train_HER(env_train, model_name, timesteps=50000):
    start = time.time()
    n_sampled_goal = 4
    goal_selection_strategy = 'future'
    model = HER('MlpPolicy',
                env_train,
                model_class=SAC,
                verbose=0,
                n_sampled_goal=n_sampled_goal,
                goal_selection_strategy=goal_selection_strategy)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (HER): ', (end - start) / 60, ' minutes')
    return model
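# A minimal companion sketch for running the HER model trained above; `env_test`
# and the step budget are assumptions for illustration, not part of the original
# training helper.
def predict_HER(model, env_test, n_steps=1000):
    obs = env_test.reset()
    for _ in range(n_steps):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env_test.step(action)
        if done:
            obs = env_test.reset()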
def main(env):
    n_actions = env.action_space.shape[0]
    noise_std = 0.2
    # Currently using OU noise
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=noise_std * np.ones(n_actions))
    model_class = DDPG  # works also with SAC and TD3
    rl_model_kwargs = {
        'actor_lr': 1e-3,
        'critic_lr': 1e-3,
        'action_noise': action_noise,
        'nb_train_steps': 300,
        'nb_rollout_steps': 100,
        'gamma': 0.95,
        'observation_range': (-1.5, 1.5),
        'random_exploration': 0.05,
        'normalize_observations': True,
        'critic_l2_reg': 0.01
    }

    # Available strategies (cf paper): future, final, episode, random
    model = HER('MlpPolicy',
                env,
                model_class,
                verbose=1,
                n_sampled_goal=4,
                goal_selection_strategy='future',
                buffer_size=int(1e5),
                batch_size=128,
                tensorboard_log="./ddpg_dvrk_tensorboard/",
                **rl_model_kwargs)

    # Reset the model
    env.reset()

    # Train the model
    model.learn(4000000,
                log_interval=100,
                callback=CheckpointCallback(save_freq=100000,
                                            save_path="./ddpg_dvrk_tensorboard/"))
    model.save("./her_robot_env")
def main(load_policy=False):
    global log_dir, log_dir_policy
    if load_policy:
        log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    fixed = True
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1500000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomPolicy,
                env,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND",
                buffer_size=1000000,
                batch_size=256,
                random_exploration=0.3,
                action_noise=action_noise)

    if load_policy:
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
            buffer_size=1000000,
            batch_size=256,
            random_exploration=0.3,
            action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps, callback=callback)
    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
class HERSACAgent(Agent):
    name = "her-sac"

    def __init__(self, env: ISettableGoalEnv, verbose=1, rank=0,
                 experiment_name="her-sac"):
        self._env = env
        self._dirs = Dirs(experiment_name=f"{type(env).__name__}-{experiment_name}",
                          rank=rank)
        options = {
            "env": env,
            "tensorboard_log": self._dirs.tensorboard,
            "model_class": SAC,
            "gamma": 1,
            "learning_rate": 3e-3
        }
        if os.path.isdir(self._dirs.models) and os.path.isfile(self._dirs.best_model):
            self._model = HER.load(load_path=self._dirs.best_model, **options)
            print(f"Loaded model {self._dirs.best_model}")
        else:
            self._model = HER(policy="MlpPolicy", verbose=verbose, **options)

    def __call__(self, obs: Observation) -> np.ndarray:
        action, _ = self._model.predict(obs, deterministic=True)
        return action

    def train(self, timesteps: int, callbacks: Sequence[BaseCallback] = None,
              num_checkpoints=4) -> None:
        callbacks = [] if callbacks is None else callbacks
        cb = CheckpointCallback(save_freq=timesteps // num_checkpoints,
                                save_path=self._dirs.models,
                                name_prefix=self._dirs.prefix)
        self._model.learn(total_timesteps=timesteps,
                          callback=CallbackList([cb, *callbacks]))
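# A minimal usage sketch for the HERSACAgent above; `make_goal_env` is a hypothetical
# factory for whatever ISettableGoalEnv this project uses, and the timestep budget is
# illustrative rather than taken from the original code.
def run_her_sac(make_goal_env, timesteps=100000):
    env = make_goal_env()
    agent = HERSACAgent(env=env, verbose=1)
    agent.train(timesteps=timesteps)
    obs = env.reset()
    return agent(obs)  # greedy action for the first observation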
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    fixed = True
    # 0 completely fixed, 1 slightly random radius, 2 big random radius
    object_position = 1
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(),
                             renders=rend,
                             useIK=1,
                             isDiscrete=discreteAction,
                             action_space=action_space,
                             fixedPositionObj=fixed,
                             includeVelObs=True,
                             object_position=object_position)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomTD3Policy,
                env,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                buffer_size=1000000,
                batch_size=256,
                random_exploration=0.3,
                action_noise=action_noise)

    if load_policy:
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
            buffer_size=1000000,
            batch_size=256,
            random_exploration=0.3,
            action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC],
                         max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy', env, model_class, n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))

    model = HER.load('./test_her')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC],
                          max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC],
                         max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
# The call head below is reconstructed for readability; the original snippet starts
# mid-constructor, so 'MlpPolicy' and `env` are assumed positional arguments.
model = HER('MlpPolicy', env,
            SAC,
            n_sampled_goal=4,
            goal_selection_strategy='future',
            verbose=1,
            buffer_size=int(1e6),
            learning_rate=0.001,
            gamma=0.95,
            batch_size=256,
            ent_coef='auto',
            random_exploration=0.3,
            learning_starts=1000,
            train_freq=1,
            policy_kwargs=dict(layers=[256, 256, 256]),
            tensorboard_log="./OpenAI/")

# Train the model
model.learn(int(8e6))
model.save("./model2")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./model2', env=env)

obs = env.reset()
episodes = 0
successes = 0
step = 0
while episodes < 50:
    step += 1
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
env, _ = load_env(env_name,
                  core_dir=core_dir,
                  envs_dir=envs_dir,
                  xmls_dir=xmls_dir,
                  return_args_remaining=True)

# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER('MlpPolicy', env, model_class, n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy, verbose=1)
# Train the model
model.learn(1000)

model.save("./her_bit_env")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./her_bit_env', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()
import gym
import highway_env

# Agent
from stable_baselines import HER, SAC

"""## Training"""

env = gym.make("parking-v0")

model = HER('MlpPolicy', env, SAC, n_sampled_goal=4,
            goal_selection_strategy='future',
            verbose=1, buffer_size=int(1e6),
            learning_rate=1e-3,
            gamma=0.9, batch_size=256,
            policy_kwargs=dict(layers=[256, 256, 256]))
model.learn(int(5e4))

"""## Visualize a few episodes

We first define a simple helper function for visualization of episodes:
"""

# !pip install gym pyvirtualdisplay
# !apt-get install -y xvfb python-opengl ffmpeg

from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
from gym.wrappers import Monitor
from pathlib import Path
import base64
from tqdm.notebook import trange
# to match the HER format.
env = GoalWrapper(env, crop_obs=True)

print('setting up model')
model = HER('MlpPolicy', env, SAC, n_sampled_goal=4,
            goal_selection_strategy='future',
            verbose=1, buffer_size=int(1e6),
            learning_rate=1e-3,
            gamma=0.95, batch_size=256)

print('start learning')
model.learn(total_timesteps=256)
print('learning done')

# Here we need to restart the environment to make rendering possible
# (doesn't work with the wrappers right now)
env = REALRobotEnv(objects=1)
env = GoalWrapper(env, crop_obs=True)
env.render("human")

print('display model')
observation = env.reset()
action = env.action_space.sample()
reward, done = 0, False
for t in range(100):
    model_action, _ = model.predict(observation)
        goal_selection_strategy=args.goal_selection_strategy,
        verbose=1,
        exploration_fraction=args.exploration_fraction,
        tensorboard_log=args.tensorboard_log_path + '/' + args.name)
else:
    model = DQN(MlpPolicy,
                env,
                verbose=1,
                tensorboard_log='/srv/share/nkannabiran3/DQN/',
                double_q=True,
                prioritized_replay=True,
                prioritized_replay_alpha=0.8,
                prioritized_replay_beta0=0.2)

print('learning')
os.mkdir(args.tensorboard_log_path + '/' + args.name)
parser.save_args()
model.learn(total_timesteps=args.num_training_steps,
            tb_log_name=args.tensorboard_log_path + '/' + args.name)
model.save(args.name)

# del model  # remove to demonstrate saving and loading
# model = DQN.load("deepq_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    if dones:
        break
    # env.render()
    'param_noise': None,
    'action_noise': action_noise,
    'normalize_observations': normalize,
    'nb_train_steps': nb_train_steps,
    'nb_rollout_steps': nb_rollout_steps,
    'batch_size': batch_size,
    'critic_l2_reg': critic_l2_reg,
    'buffer_size': buffer_size,
    'random_exploration': random_exploration,
    'policy_kwargs': {
        'layer_norm': True
    },
    'logging': suff
}

model = HER('MlpPolicy', env, DDPG, **kwargs)

start = time.time()
model.learn(total_timesteps=total_timesteps, log_interval=1)

if log:
    model.save("pkl/{}".format(suff))
    print("Saved as {0}, trained {1} primitive policy for {2} timesteps in {3}".format(
        suff, policy, total_timesteps,
        time.strftime('%H:%M:%S', time.gmtime(time.time() - start))))
else:
    print("Trained {0} primitive policy for {1} timesteps in {2}".format(
        policy, total_timesteps,
        time.strftime('%H:%M:%S', time.gmtime(time.time() - start))))
            batch_size=256,
            policy_kwargs=dict(layers=[256, 256, 256]))

# DDPG Hyperparams:
# NOTE: it works even without action noise
# n_actions = env.action_space.shape[0]
# noise_std = 0.2
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
# model = HER('MlpPolicy', env, DDPG, n_sampled_goal=n_sampled_goal,
#             goal_selection_strategy='future',
#             verbose=1, buffer_size=int(1e6),
#             actor_lr=1e-3, critic_lr=1e-3, action_noise=action_noise,
#             gamma=0.95, batch_size=256,
#             policy_kwargs=dict(layers=[256, 256, 256]))

model.learn(int(2e5))
model.save('her_sac_highway')

# Load saved model
model = HER.load('her_sac_highway', env=env)
obs = env.reset()

# Evaluate the agent
episode_reward = 0
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get('is_success', False):
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

num_env = 2
# env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)])

model_class = DQN
# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

model = HER('MlpPolicy', env, model_class, n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy, verbose=1)

model.learn(total_timesteps=1000)
model.save(log_dir + "model")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load(log_dir + "model", env=env)

# evaluate agent
episodes = 100
ep_r = []
ep_l = []
for e in range(episodes):
    obs = env.reset()
    total_r = 0.
    total_l = 0.
    while True:
env = gym.make('PointMass-%d-v1' % num_objs)

n_actions = env.action_space.shape[-1]
stddev = 0.2
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.01 * np.ones(n_actions))
policy = 'MlpPolicy'

args_alg = dict(
    random_exploration=0.2,
    buffer_size=int(1E6),
    batch_size=256,
    nb_eval_steps=10,
    action_noise=action_noise,
    tensorboard_log=logger,
)

model = HER(policy, env, model_class, n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1, **args_alg)

model.learn(int(nIter))
model.save(expDir + "/%s" % np.format_float_scientific(nIter))
# model = HER.load("point1_deter", env=env)

record_her_indep(env, model, expDir, num_files=10, video_len=500)
os.makedirs(model_path, exist_ok=True)
set_global_seeds(0)
num_of_active_envs = 1
policy_kwargs = dict(layer=[256, 256])
# env = gym.make("real_robot_challenge_phase_1-v1")
env = FlatObservationWrapper(
    ExamplePushingTrainingEnv(frameskip=20, visualization=False))

train_configs = {
    "gamma": 0.99,
    "n_steps": int(120000 / 20),
    "ent_coef": 0.01,
    "learning_rate": 0.00025,
    "vf_coef": 0.5,
    "max_grad_norm": 0.5,
    "nminibatches": 40,
    "noptepochs": 4,
}

model = HER(MlpPolicy, env, SAC, verbose=1, tensorboard_log=model_path)

ckpt_frequency = int(validate_every_timesteps / num_of_active_envs)
checkpoint_callback = CheckpointCallback(save_freq=ckpt_frequency,
                                         save_path=model_path,
                                         name_prefix="model")

model.learn(int(total_time_steps), callback=checkpoint_callback)
env.close()
import gym
import time

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, DQN, HER, DDPG
import synergyenvs

env = gym.make("GraspBoxPybullet-v0")
env.render()
o = env.reset()

# model = PPO2(MlpPolicy, env, verbose=1)
model = HER('MlpPolicy', env, DDPG, n_sampled_goal=4, verbose=1)
model.learn(50000)
model.save("./her_graspbox-1")

env.camera_adjust()

for _ in range(1000):
    o = env.reset()
    env.render()
    action, _states = model.predict(o)
    # action = env.action_space.sample()
    o, r, done, info = env.step(action)
    print(o, r, done, info)
    if done:
        o = env.reset()
    time.sleep(0.2)
def launchAgent():
    import Reinforcement_AI.env.d_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    model_name = "PPO2"

    if model_name == "HER":
        model = HER("CnnPolicy",
                    env=image_env.DetailedMiniMapEnv(),
                    model_class=DQN)
    elif model_name == "DDPG":
        model = DDPG(policy="CnnPolicy",
                     env=image_env.DDPGImageEnv(),
                     normalize_observations=True)
    elif model_name == "PPO2":
        # env = image_env.DetailedMiniMapEnv()
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy",
                     env=env,
                     verbose=1)
    else:
        model = DQN(
            "CnnPolicy",                         # policy
            env=image_env.DetailedMiniMapEnv(),  # environment
            double_q=True,                       # Double Q enabled
            prioritized_replay=True,             # prioritized replay buffer enabled
            verbose=0                            # log printing
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i))
                model.set_env(image_env.DDPGImageEnv())
            elif model_name == "PPO2":
                # print('set env')
                # ppo2_env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
                # print('get model')
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
                # print('set model env')
                # model.set_env(ppo2_env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())

        # print('model learn start')
        model.learn(total_timesteps=3900)
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        del model
def callback(_locals, _globals):
    global n_steps
    n_steps += 1
    if n_steps % 50000 == 0 or n_steps == 10000:
        print('Saving: ', n_steps)
        save_path = 'checkpoints/yumi/her/her_{}_task_{}_{}.npy'.format(
            name, args.task, n_steps)
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        model.save(save_path)
    return True


model = HER('MlpPolicy', env, model_class=DDPG, verbose=1,
            tensorboard_log=log_dir, **dict(random_exploration=.2))
model.learn(total_timesteps=total_timesteps, callback=callback)
model.save("her-yumi-{}-final".format(n_steps))
env.save_running_average(log_dir)

obs = env.reset()
for i in range(100):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()