print("Loading pretrained agent") # Policy should not be changed del hyperparams['policy'] model = ALGOS[args.algo].load(args.trained_agent, env=env, tensorboard_log=tensorboard_log, verbose=args.verbose, **hyperparams) exp_folder = args.trained_agent[:-4] if normalize: print("Loading saved running average") stats_path = os.path.join(exp_folder, env_id) if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')): env = VecNormalize.load( os.path.join(stats_path, 'vecnormalize.pkl'), env) else: # Legacy: env.load_running_average(exp_folder) elif args.optimize_hyperparameters: if args.verbose > 0: print("Optimizing hyperparameters") def create_model(*_args, **kwargs): """ Helper to create a model with different hyperparameters """ return ALGOS[args.algo](env=create_env(n_envs, no_log=True), tensorboard_log=tensorboard_log,
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
from stable_baselines.common.callbacks import EvalCallback
from stable_baselines.common.vec_env import VecNormalize
from stable_baselines import PPO2
from LearningRocket import LearningRocket
import matplotlib.pyplot as plt
import numpy as np

# multiprocess environment
env = make_vec_env(LearningRocket, n_envs=16)
eval_env = make_vec_env(lambda: LearningRocket(visualize=True), n_envs=1)
#env = VecNormalize(env)
#eval_env = VecNormalize(eval_env)
env = VecNormalize.load("doof_env", env)
eval_env = VecNormalize.load("doof_env", eval_env)

eval_callback = EvalCallback(eval_env, best_model_save_path='Agent007',
                             log_path='./logs/', eval_freq=10000,
                             deterministic=True, render=False,
                             n_eval_episodes=1)

#model = PPO2(MlpPolicy, env, n_steps=1000, nminibatches=32, lam=0.98, gamma=0.999, learning_rate=1e-4,
#             noptepochs=4, ent_coef=0.01, verbose=1, tensorboard_log="./rocket_tensorboard/",
#             policy_kwargs=dict(layers=[400, 300]))
"""model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="./rocket_tensorboard/",
             policy_kwargs=dict(layers=[400, 300]))"""
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
        _seed: generated by Sacred.
        env_name: Gym environment identifier.
        eval_n_timesteps: Minimum number of timesteps to evaluate for. Set
            exactly one of `eval_n_episodes` and `eval_n_timesteps`.
        eval_n_episodes: Minimum number of episodes to evaluate for. Set
            exactly one of `eval_n_episodes` and `eval_n_timesteps`.
        num_vec: Number of environments to run simultaneously.
        parallel: If True, use `SubprocVecEnv` for true parallelism;
            otherwise, uses `DummyVecEnv`.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps
            per episode.
        render: If True, renders interactively to the screen.
        render_fps: Target frame rate for interactive rendering.
        log_dir: The directory to log intermediate output to. (As of
            2019-07-19 this is just episode-by-episode reward from
            bench.Monitor.)
        policy_type: A unique identifier for the saved policy, defined in
            POLICY_CLASSES.
        policy_path: A path to the serialized policy.
        reward_type: If specified, overrides the environment reward with a
            reward of this type.
        reward_path: If reward_type is specified, the path to a serialized
            reward of `reward_type` to override the environment reward with.

    Returns:
        Return value of `imitation.util.rollout.rollout_stats()`.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    venv = util.make_vec_env(env_name, num_vec, seed=_seed, parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps)
    # Restore the normalization statistics saved alongside the policy.
    # (VecNormalize.load wraps the venv itself, so the env is not pre-wrapped;
    # the original pre-wrapped it in a second VecNormalize, which would have
    # applied the normalization twice.)
    venv = VecNormalize.load(os.path.join(policy_path, "vec_normalize.pkl"),
                             venv)
    venv.training = False
    venv.norm_reward = False
    if render:
        venv = InteractiveRender(venv, render_fps)
    # TODO(adam): add support for videos using VideoRecorder?

    with contextlib.ExitStack() as stack:
        if reward_type is not None:
            reward_fn_ctx = load_reward(reward_type, reward_path, venv)
            reward_fn = stack.enter_context(reward_fn_ctx)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            tf.logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            trajs = rollout.generate_trajectories(policy, venv, sample_until)
    return rollout.rollout_stats(trajs)
def normalize_env(
    env,
    orig_log_dir,
    sb_version,
    vectorize=True,
    continue_learning=False,
    evaluate=False,
    evaluate_during_learning=False,
    normalize_kwargs=None,
):
    if vectorize:
        env = DummyVecEnv([lambda: env])
    logger.debug("Normalize: {}".format(normalize_kwargs))

    if evaluate:
        # FIXME: in continue learning, training should be True so that we
        # update the running average of obs and rewards with new samples;
        # if I do that, the algo performs very poorly even with no changes
        # in the env
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        if not evaluate_during_learning or continue_learning:
            if not os.path.exists(os.path.join(orig_log_dir, "vecnormalize.pkl")):
                env_name = get_env_name(env=env.unwrapped, sb_version=sb_version)
                index_last_separator = orig_log_dir.rindex("/")
                new_orig_log_dir = os.path.join(
                    orig_log_dir[0:index_last_separator], "logs_" + env_name)
                logger.debug(
                    "{} does not exist. Trying to search it in the original model directory {}"
                    .format(os.path.join(orig_log_dir, "vecnormalize.pkl"),
                            new_orig_log_dir))
                assert os.path.exists(new_orig_log_dir), \
                    "{} does not exist".format(new_orig_log_dir)
                assert os.path.exists(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")), (
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl")
                        + " does not exist")
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl"), env)
                else:
                    env = VecNormalize.load(
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl"), env)
            else:
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
                else:
                    env = VecNormalize.load(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), env)

            # Deactivate training and reward normalization
            env.training = False
            env.norm_reward = False

    elif continue_learning:
        # FIXME: don't know why, but during continue learning I have to
        # disable training, otherwise performance is not the same as in the
        # model trained from scratch, even without changing the params of the
        # environment. In rl-baselines-zoo this is not done during continue
        # learning:
        # https://github.com/araffin/rl-baselines-zoo/blob/master/train.py#L365
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        assert os.path.exists(os.path.join(orig_log_dir, "vecnormalize.pkl")), (
            os.path.join(orig_log_dir, "vecnormalize.pkl") + " does not exist")
        logger.debug("[continue_learning] Loading {}".format(
            os.path.join(orig_log_dir, "vecnormalize.pkl")))
        if sb_version == "sb3":
            env = VecNormalize3.load(
                os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
        else:
            env = VecNormalize.load(
                os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
    else:
        if sb_version == "sb3":
            env = VecNormalize3(env, **normalize_kwargs)
        else:
            env = VecNormalize(env, **normalize_kwargs)
    return env
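A hypothetical call showing how the helper above would typically be used at evaluation time; the environment id, log directory, and normalize kwargs here are assumptions for illustration, not values from the original project:

import gym

# Rebuild an evaluation env whose statistics live in
# <orig_log_dir>/vecnormalize.pkl (or the fallback logs_<env_name> folder).
eval_env = normalize_env(
    gym.make("CartPole-v1"),
    orig_log_dir="logs/logs_CartPole-v1",
    sb_version="sb3",
    evaluate=True,
    normalize_kwargs={"norm_obs": True, "norm_reward": False},
)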
    h_shape = (10, 10, 1024)  # Shape of the hidden state of the LSTM network
    history_shape = 20        # Number of past actions to be tracked
    env = InattEnv(args, h_shape=h_shape, history_shape=history_shape,
                   is_test=True, dynamic_gamma=dynamic_lambda_0)
    if dynamic_lambda_0:
        print("INFO: Gamma has been set to " + str(args.lambda_0))
        env.lambda_0 = args.lambda_0

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, norm_reward=False)

    # Create RL policy
    policy = PPO2.load(os.path.join(args.rl_path, "best_agent.zip"), env=env,
                       verbose=1)
    if args.normalize_env:
        if os.path.exists(os.path.join(args.rl_path, 'vecnormalize.pkl')):
            env = VecNormalize.load(
                os.path.join(args.rl_path, 'vecnormalize.pkl'), env)
            env.training = False
            env.norm_reward = False
        else:
            raise Exception("Normalization parameters not found")
    print("INFO: Loaded model " + os.path.join(args.rl_path, "best_agent.zip"))
else:
    raise Exception("Policy type not recognized")

# Active for plotting images
plot_image = True
if plot_image:

    def on_trackbar(val):
        if args.policy == 'baseline':
def create_test_env(env_id, n_envs=1, is_atari=False,
                    stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None,
                    env_kwargs=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    if env_kwargs is None:
        env_kwargs = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safety
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir,
                                      wrapper_class=env_wrapper,
                                      env_kwargs=env_kwargs)
                             for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir,
                                      wrapper_class=env_wrapper,
                                      env_kwargs=env_kwargs)])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir,
                                    wrapper_class=env_wrapper,
                                    env_kwargs=env_kwargs)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False,
                               **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
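A hypothetical call to the helper above, restoring running averages saved during training; the environment id, stats path, and hyperparams dict are assumptions for illustration only:

env = create_test_env(
    "HalfCheetahBulletEnv-v0",
    n_envs=1,
    stats_path="trained_agents/ppo2/HalfCheetahBulletEnv-v0",
    seed=0,
    log_dir=None,  # skip Monitor/CSV logging
    hyperparams={"normalize": True,
                 "normalize_kwargs": {"norm_reward": False}},
)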
def create_test_env(env_id, n_envs=1, is_atari=False,
                    stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safety
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir,
                                      wrapper_class=env_wrapper)
                             for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        try:
            class_ = load(spec.entry_point)
        except AttributeError:
            # Backward compatibility with gym
            class_ = load(spec._entry_point)

        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        render_name = None
        use_subproc = 'renders' not in inspect.getfullargspec(class_.__init__).args
        if not use_subproc:
            render_name = 'renders'
        # Dev branch of pybullet
        # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args
        # if not use_subproc and render_name is None:
        #     render_name = 'render'

        # Create the env, with the original kwargs, and the new ones
        # overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, **{render_name: should_render})
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env, os.path.join(log_dir, "0"),
                              allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir,
                                          wrapper_class=env_wrapper)])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir,
                                    wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False,
                               **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
def __init__(self, algorithm="SAC", load=True, agent_name="Agent001"):
    self.agent_name = agent_name
    #self.env = LearningRocket(visualize=False)
    #self.env = NormalizeActionWrapper(self.env)
    #self.eval_env = LearningRocket(visualize=True)
    #self.eval_env = NormalizeActionWrapper(self.eval_env)
    #self.env = SubprocVecEnv([lambda: LearningRocket(visualize=False) for i in range(4)])
    self.env = make_vec_env(LearningRocket, n_envs=16)
    #self.eval_env = VecNormalize(DummyVecEnv([lambda: LearningRocket(visualize=True) for i in range(1)]))
    self.eval_env = make_vec_env(lambda: LearningRocket(visualize=True),
                                 n_envs=1)
    #self.eval_env = VecNormalize(self.eval_env)
    self.eval_callback = EvalCallback(self.eval_env,
                                      best_model_save_path='Agent007',
                                      log_path='./logs/', eval_freq=10000,
                                      deterministic=True, render=False,
                                      n_eval_episodes=1)
    kai_policy = dict(act_fun=tf.nn.tanh, net_arch=[400, 300])
    #check_env(self.env, warn=True)

    """
    if algorithm == "SAC":
        if load is True:
            self.model = SAC.load(agent_name, env=self.env,
                                  tensorboard_log="./rocket_tensorboard/")
            #self.model.ent_coef = 0.2
        else:
            self.model = SAC('MlpPolicy', self.env, verbose=1,
                             tensorboard_log="./rocket_tensorboard/", ent_coef=5)
        print("Trainer Set for SAC")
    """
    if algorithm == "TD3":
        n_actions = self.env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        if load is True:
            self.model = TD3.load(agent_name, env=self.env,
                                  tensorboard_log="./rocket_tensorboard/")
            #file = open('replay_buffer', 'rb')
            #self.model.replay_buffer = pickle.load(file)
            #file.close()
        else:
            self.model = TD3(MlpPolicy, self.env, action_noise=action_noise,
                             batch_size=768, gamma=0.95, learning_rate=1e-4,
                             learning_starts=20000, verbose=1,
                             tensorboard_log="./rocket_tensorboard/",
                             policy_kwargs=dict(layers=[400, 300]))
        print("Trainer Set for TD3")
    elif algorithm == "PPO2":
        if load is True:
            self.model = PPO2.load(agent_name, env=self.env,
                                   tensorboard_log="./rocket_tensorboard/")
            self.eval_env = VecNormalize.load(self.agent_name + "vEnv",
                                              self.eval_env)
            #self.eval_env.clip_obs = 500
            #self.env = VecNormalize(self.env)
            self.env = VecNormalize.load(self.agent_name + "vEnv", self.env)
            #self.env.clip_obs = 500
            #self.env.norm_obs = False
            #self.eval_env.norm_obs = False
        else:
            self.model = PPO2(PPOMlpPolicy, self.env, n_steps=1024,
                              nminibatches=32, lam=0.98, gamma=0.999,
                              noptepochs=4, ent_coef=0.01, verbose=1,
                              tensorboard_log="./rocket_tensorboard/",
                              policy_kwargs=dict(layers=[400, 300]))
            self.eval_env = VecNormalize(self.eval_env)
            self.env = VecNormalize(self.env)
            #self.eval_env.clip_obs = 500
            #self.env.clip_obs = 500
            #self.env.norm_obs = False
            #self.eval_env.norm_obs = False
        print("Trainer set for PPO2. I am speed.")
def main():
    try:
        difficulty = int(sys.argv[1])
        initial_pose_json = sys.argv[2]
        goal_pose_json = sys.argv[3]
        output_file = sys.argv[4]
    except IndexError:
        print("Incorrect number of arguments.")
        print("Usage:\n"
              "\tevaluate_policy.py <difficulty_level> <initial_pose>"
              " <goal_pose> <output_file>")
        sys.exit(1)

    # the poses are passed as JSON strings, so they need to be converted first
    initial_pose = move_cube.Pose.from_json(initial_pose_json)
    goal_pose = move_cube.Pose.from_json(goal_pose_json)

    # create a FixedInitializer with the given values
    initializer = FixedInitializer(difficulty, initial_pose, goal_pose)

    # TODO: Replace with your environment if you used a custom one.
    env = CubeEnv(frameskip=5, visualization=False, initializer=initializer,
                  action_type=ActionType.POSITION,
                  observation_type=ObservationType.WITHOUT_GOALS,
                  testing=True)
    env = FrameStackWrapper(TimeFeatureWrapper(FlatObservationWrapper(env)), 4)
    #env = TimeFeatureWrapper(FlatObservationWrapper(env))
    norm_env = VecNormalize.load("models/normalized_env_frame_stacked_model",
                                 DummyVecEnv([lambda: env]))

    if difficulty == 1:
        policy = SAC.load(
            "models/checkpoint_saves/CONTINUE_SAC_09_19_2020_01_55_01__1000000_steps.zip")
    elif difficulty == 2:
        policy = SAC.load(
            "models/checkpoint_saves/CONTINUE_SAC_09_19_2020_01_55_06__1000000_steps.zip")
    elif difficulty == 3:
        policy = SAC.load(
            "models/checkpoint_saves/CONTINUE_SAC_09_19_2020_01_55_14__1000000_steps.zip")
    elif difficulty == 4:
        policy = SAC.load(
            "models/checkpoint_saves/CONTINUE_SAC_09_19_2020_01_55_18__1000000_steps.zip")

    # Execute one episode. Make sure that the number of simulation steps
    # matches with the episode length of the task. When using the default Gym
    # environment, this is the case when looping until is_done == True. Make
    # sure to adjust this in case your custom environment behaves differently!
    is_done = False
    observation = env.reset()
    accumulated_reward = 0
    while not is_done:
        action, _ = policy.predict(
            np.expand_dims(norm_env.normalize_obs(observation), axis=0),
            deterministic=True)
        observation, reward, is_done, info = env.step(action[0])
        accumulated_reward += reward

    print("Accumulated reward: {}".format(accumulated_reward))

    # store the log for evaluation
    env.platform.store_action_log(output_file)
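# Aside (a sketch, not part of the script above): instead of normalizing each
# observation by hand with norm_env.normalize_obs(), the loaded VecNormalize
# wrapper can be stepped directly, returning already-normalized observations.
# The pickle path and single-env setup are assumed to match the script above.
def run_episode_normalized(env, policy):
    eval_env = VecNormalize.load("models/normalized_env_frame_stacked_model",
                                 DummyVecEnv([lambda: env]))
    eval_env.training = False     # freeze running statistics at test time
    eval_env.norm_reward = False  # report raw rewards
    obs = eval_env.reset()
    done = False
    total = 0.0
    while not done:
        action, _ = policy.predict(obs, deterministic=True)
        obs, rewards, dones, infos = eval_env.step(action)
        total += float(rewards[0])
        done = bool(dones[0])
    return total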
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
from stable_baselines.common.callbacks import EvalCallback
from stable_baselines.common.vec_env import VecNormalize
from stable_baselines import PPO2
from TestHover.LearningRocketHover import LearningRocket
import matplotlib.pyplot as plt
import numpy as np

# multiprocess environment
env = make_vec_env(LearningRocket, n_envs=16)
eval_env = make_vec_env(lambda: LearningRocket(visualize=True), n_envs=1)
#env = VecNormalize(env)
#eval_env = VecNormalize(eval_env)
env = VecNormalize.load("TestHover_env", env)
eval_env = VecNormalize.load("TestHover_env", eval_env)

eval_callback = EvalCallback(eval_env, best_model_save_path='Agent007',
                             log_path='./logs/', eval_freq=10000,
                             deterministic=True, render=False,
                             n_eval_episodes=1)

model = PPO2(MlpPolicy, env, n_steps=1000, nminibatches=32, lam=0.98,
    return _init


if __name__ == '__main__':
    log_dir = 'models/hover/empty_world_small/finalVec'
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env_id = 'CrazyflieObstacleEval-v0'

    # Load the agent
    model = PPO2.load(log_dir + '/ppo2_final')

    # Load the saved statistics
    env = DummyVecEnv([
        lambda: gym.make(env_id, n_obstacles=1, avoidance_method='Heuristic')
    ])
    env = VecNormalize.load(stats_path, env)
    # do not update them at test time
    env.training = False
    # reward normalization is not needed at test time
    env.norm_reward = False

    eval_episodes = 50
    total_goals_reached = 0
    total_collisions = 0
    total_flips = 0
    total_steps_exceeded = 0
    total_potential_collisions = 0
    total_collisions_avoided = 0
    total_timesteps = 0
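    # The excerpt ends before the evaluation loop. A minimal sketch of what it
    # could look like follows; 'goal_reached' and 'collision' are hypothetical
    # info-dict keys, shown only to illustrate how the counters above might be
    # filled in (they would have to match what the env actually reports).
    for _ in range(eval_episodes):
        obs = env.reset()
        dones = [False]
        while not dones[0]:
            action, _states = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            total_timesteps += 1
        if infos[0].get('goal_reached'):
            total_goals_reached += 1
        if infos[0].get('collision'):
            total_collisions += 1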
def learn(self):
    # Use deterministic actions for evaluation
    eval_path = self.model_dir + "/best_model"
    # TODO save checkpoints with vecnormalize callback pkl file
    save_vec_normalize = SaveVecNormalizeCallback(save_freq=1,
                                                  save_path=eval_path)
    if self.norm:
        # Don't normalize the reward for the test env
        self.test_env = VecNormalize(self.test_env, norm_obs=True,
                                     norm_reward=False, clip_obs=10.)
    eval_callback = EvalCallback(self.test_env,
                                 best_model_save_path=eval_path,
                                 log_path=eval_path + '/logs',
                                 eval_freq=50000, n_eval_episodes=10,
                                 callback_on_new_best=save_vec_normalize,
                                 deterministic=True, render=False)
    checkpoint_callback = CheckpointCallback(save_freq=25000,
                                             save_path=self.model_dir + '/logs/',
                                             name_prefix='rl_model')
    time_callback = TrainingTimeCallback()
    tensorboard_file = (None if self.config[self.algo]['tensorboard_logs'] is None
                        else "tensorboard_logs/" + self.model_dir)

    if self.algo == 'SAC':
        if not self.env.envs[0].is_simplified() and (
                self.env.envs[0].depth_obs or self.env.envs[0].full_obs):
            policy_kwargs = {
                "layers": self.config[self.algo]['layers'],
                "cnn_extractor": custom_obs_policy.create_augmented_nature_cnn(1)}
            policy = sacCnn
        elif self.env.envs[0].depth_obs or self.env.envs[0].full_obs:
            policy_kwargs = {}
            policy = sacCnn
        else:
            policy_kwargs = {"layers": self.config[self.algo]['layers'],
                             "layer_norm": False}
            policy = sacMlp
        if self.load_dir:
            top_folder_idx = self.load_dir.rfind('/')
            top_folder_str = self.load_dir[0:top_folder_idx]
            if self.norm:
                self.env = VecNormalize(self.env, training=True,
                                        norm_obs=False, norm_reward=False,
                                        clip_obs=10.)
                self.env = VecNormalize.load(
                    os.path.join(top_folder_str, 'vecnormalize.pkl'), self.env)
            model = sb.SAC(policy, self.env, policy_kwargs=policy_kwargs,
                           verbose=1, gamma=self.config['discount_factor'],
                           buffer_size=self.config[self.algo]['buffer_size'],
                           batch_size=self.config[self.algo]['batch_size'],
                           learning_rate=self.config[self.algo]['step_size'],
                           tensorboard_log=tensorboard_file)
            model_load = sb.SAC.load(self.load_dir, self.env)
            params = model_load.get_parameters()
            model.load_parameters(params, exact_match=False)
        else:
            if self.norm:
                self.env = VecNormalize(self.env, norm_obs=True,
                                        norm_reward=True, clip_obs=10.)
            model = sb.SAC(policy, self.env, policy_kwargs=policy_kwargs,
                           verbose=2, gamma=self.config['discount_factor'],
                           buffer_size=self.config[self.algo]['buffer_size'],
                           batch_size=self.config[self.algo]['batch_size'],
                           learning_rate=self.config[self.algo]['step_size'],
                           tensorboard_log=tensorboard_file)
    elif self.algo == 'TRPO':
        model = sb.TRPO(MlpPolicy, self.env, verbose=2,
                        gamma=self.config['discount_factor'],
                        timesteps_per_batch=self.config[self.algo]['max_iters'],
                        vf_stepsize=self.config[self.algo]['step_size'],
                        tensorboard_log=tensorboard_file)
    elif self.algo == 'PPO':
        if not self.env.envs[0].is_simplified() and (
                self.env.envs[0].depth_obs or self.env.envs[0].full_obs):
            policy_kwargs = {
                "layers": self.config[self.algo]['layers'],
                "cnn_extractor": custom_obs_policy.create_augmented_nature_cnn(1)}
            policy = CnnPolicy
        elif self.env.envs[0].depth_obs or self.env.envs[0].full_obs:
            policy_kwargs = {}
            policy = CnnPolicy
        else:
            policy_kwargs = {"layers": self.config[self.algo]['layers'],
                             "layer_norm": False}
            policy = MlpPolicy
        # NOTE: the original passed MlpPolicy here, silently ignoring the
        # `policy` and `policy_kwargs` selected above; using them is almost
        # certainly what was intended.
        model = sb.PPO2(policy, self.env, policy_kwargs=policy_kwargs,
                        verbose=2, gamma=self.config['discount_factor'],
                        learning_rate=self.config[self.algo]['learning_rate'],
                        tensorboard_log=tensorboard_file)
    elif self.algo == 'DQN':
        if self.load_dir:
            model = self.load_params()
        else:
            model = sb.DQN(DQNMlpPolicy, self.env, verbose=2,
                           gamma=self.config['discount_factor'],
                           batch_size=self.config[self.algo]['batch_size'],
                           prioritized_replay=self.config[self.algo]['prioritized_replay'],
                           tensorboard_log=tensorboard_file)
    elif self.algo == "DDPG":
        param_noise = AdaptiveParamNoiseSpec()
        model = sb.DDPG(ddpgMlp, self.env, verbose=2,
                        gamma=self.config['discount_factor'],
                        param_noise=param_noise,
                        tensorboard_log=tensorboard_file)

    try:
        model.learn(total_timesteps=int(self.config[self.algo]['total_timesteps']),
                    callback=[TensorboardCallback(self.env, tensorboard_file,
                                                  self.algo, self.log_freq,
                                                  self.model_dir),
                              eval_callback])
    except KeyboardInterrupt:
        pass
    self.save(model, self.model_dir)
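`self.save` is not shown in this excerpt; the following is a plausible sketch of what it needs to do when `self.norm` is set, so that the statistics loaded above stay in sync with the saved model (the method body and file names are assumptions, not the repository's actual code):

def save(self, model, model_dir):
    # Hypothetical sketch: persist the model and, when normalization is on,
    # the VecNormalize running statistics next to it, so that evaluation code
    # can later call VecNormalize.load() with matching stats.
    model.save(os.path.join(model_dir, 'final_model'))
    if self.norm:
        self.env.save(os.path.join(model_dir, 'vecnormalize.pkl'))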
def train(method="SAC"):
    def get_multi_process_env(num_of_envs, subprocess=True,
                              amplitude_scaling=False, frameskip=5,
                              with_goals=False,
                              action_type=ActionType.POSITION, difficulty=1,
                              initializer="random", testing=False):
        if initializer == "random":
            initializer = RandomInitializer(difficulty=difficulty)
        elif initializer == "completely_random":
            initializer = CompletelyRandomInitializer()

        def _make_env(rank):
            def _init():
                obs_type = (ObservationType.WITH_GOALS if with_goals
                            else ObservationType.WITHOUT_GOALS)
                out_env = CubeEnv(frameskip=frameskip, visualization=False,
                                  initializer=initializer,
                                  action_type=action_type,
                                  observation_type=obs_type, testing=testing)
                out_env.seed(seed=54321)
                out_env.action_space.seed(seed=54321)
                if not with_goals:
                    out_env = FlatObservationWrapper(
                        out_env, amplitude_scaling=amplitude_scaling)
                    out_env = TimeFeatureWrapper(
                        out_env, max_steps=math.ceil(3750 / frameskip))
                else:
                    out_env = GoalObservationWrapper(
                        out_env, amplitude_scaling=amplitude_scaling)
                return out_env
            return _init

        if subprocess:
            return SubprocVecEnv([_make_env(rank=i) for i in range(num_of_envs)])
        else:
            return DummyVecEnv([_make_env(rank=i) for i in range(num_of_envs)])

    date_time_str = datetime.now().strftime("%m_%d_%Y_%H_%M_%S_")
    print(method, date_time_str)
    set_global_seeds(0)

    if method == "HER":
        env = get_multi_process_env(1, subprocess=False,
                                    amplitude_scaling=True, frameskip=5,
                                    with_goals=True)
        env.set_attr("reward_range", 1000)
        policy_kwargs = dict(layers=[128, 128], act_fun=tf.tanh)
        n_actions = env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=float(0.2) * np.ones(n_actions))
        model = HER("MlpPolicy", env, SAC, policy_kwargs=policy_kwargs,
                    n_sampled_goal=4, goal_selection_strategy='future',
                    verbose=1, tensorboard_log="tblogs", batch_size=512,
                    buffer_size=100000, gamma=0.98, learning_starts=10000,
                    random_exploration=0.15)
        model.learn(int(2e6), log_interval=10,
                    callback=CheckpointCallback(
                        save_freq=int(1e5),
                        save_path='models/checkpoint_saves',
                        name_prefix=method + '_' + date_time_str),
                    tb_log_name=method + '_' + date_time_str)

    if method == "SAC":
        env = VecNormalize(VecFrameStack(
            get_multi_process_env(1, subprocess=False,
                                  amplitude_scaling=False, frameskip=5,
                                  action_type=ActionType.POSITION,
                                  difficulty=1,
                                  initializer="completely_random"), 4),
            norm_reward=False, clip_reward=1500, gamma=0.99)
        policy_kwargs = dict(layers=[256, 256])
        n_actions = env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=float(0.2) * np.ones(n_actions))
        model = SAC("LnMlpPolicy", env, policy_kwargs=policy_kwargs,
                    buffer_size=1000000, batch_size=256, gamma=0.99,
                    learning_rate=LinearSchedule(int(2e6), 5e-5,
                                                 initial_p=3e-4).value,
                    train_freq=64, gradient_steps=4, tau=0.005,
                    learning_starts=10000, tensorboard_log="tblogs",
                    verbose=1, use_emph_exp=True, action_noise=action_noise)
        model.learn(int(2e6), log_interval=10,
                    callback=CheckpointCallback(
                        save_freq=int(5e5),
                        save_path='models/checkpoint_saves',
                        name_prefix=method + '_' + date_time_str),
                    tb_log_name=method + '_' + date_time_str)
        env.save("normalized_env_" + date_time_str)

    if method == "CONTINUE_SAC":
        difficulty = 4
        env = VecNormalize.load(
            "models/normalized_env_frame_stacked_model",
            VecFrameStack(
                get_multi_process_env(1, subprocess=False,
                                      amplitude_scaling=True, frameskip=5,
                                      action_type=ActionType.POSITION,
                                      difficulty=difficulty,
                                      initializer="random", testing=True), 4))
        model = SAC.load(
            "models/checkpoint_saves/SAC_09_18_2020_19_07_42__1000000_steps.zip",
            env=env,
            tensorboard_log="tblogs",
        )
        model.learn(int(1e6), log_interval=10,
                    callback=CheckpointCallback(
                        save_freq=int(5e5),
                        save_path='models/checkpoint_saves',
                        name_prefix=method + '_' + date_time_str),
                    tb_log_name=method + '_' + date_time_str)
        env.save("normalized_env_difficulty_" + str(difficulty))
        model.save(os.path.join('models', "model_difficulty_" + str(difficulty)))

    if method == "save_vec_env":
        env = VecNormalize(
            get_multi_process_env(1, subprocess=False, amplitude_scaling=True,
                                  frameskip=5,
                                  action_type=ActionType.POSITION,
                                  difficulty=1,
                                  initializer="completely_random"))
        model = SAC.load(
            "models/checkpoint_saves/SAC_09_18_2020_14_27_30__2000000_steps.zip",
            env=env)
        model.learn(int(1e5), log_interval=1)
        env.save("normalized_env_without_framestack")
        return
    else:
        return

    print("save model: ", os.path.join('models', method + '_' + date_time_str))
def main():
    # SIM_ENV_NAME = 'InvertedPendulum-v2'
    # REAL_ENV_NAME = 'InvertedPendulumModified-v2_old'
    # expt_label = input('Enter a label for the experiment : ')

    parser = argparse.ArgumentParser(
        description='Reinforced Grounded Action Transformation')
    parser.add_argument('--target_policy_algo', default="TRPO", type=str,
                        help="name in str of the agent policy training algorithm")
    parser.add_argument('--action_tf_policy_algo', default="TRPO", type=str,
                        help="name in str of the Action Transformer policy training algorithm")
    parser.add_argument('--load_policy_path',
                        default='data/models/TRPO_initial_policy_steps_Hopper-v2_1000000_.pkl',
                        help="relative path of initial policy trained in sim")
    parser.add_argument('--alpha', default=1.0, type=float,
                        help="Deprecated feature. Ignore")
    parser.add_argument('--beta', default=1.0, type=float,
                        help="Deprecated feature. Ignore")
    parser.add_argument('--n_trainsteps_target_policy', default=1000000,
                        type=int,
                        help="Number of time steps to train the agent policy in the grounded environment")
    parser.add_argument('--n_trainsteps_action_tf_policy', default=1000000,
                        type=int,
                        help="Timesteps to train the Action Transformer policy in the ATPEnvironment")
    parser.add_argument('--num_cores', default=10, type=int,
                        help="Number of threads to use while collecting real world experience")
    parser.add_argument('--sim_env', default='InvertedPendulum-v2',
                        help="Name of the simulator environment (Unmodified)")
    parser.add_argument('--real_env', default='InvertedPendulumModified-v2',
                        help="Name of the Real World environment (Modified)")
    parser.add_argument('--n_frames', default=1, type=int,
                        help="Number of previous frames observed by discriminator")
    parser.add_argument('--expt_number', default=1, type=int,
                        help="Expt. number to keep track of multiple experiments")
    parser.add_argument('--n_grounding_steps', default=1, type=int,
                        help="Number of grounding steps. (Outer loop of algorithm)")
    parser.add_argument('--n_iters_atp', default=20, type=int,
                        help="Number of GAN iterations")
    parser.add_argument('--discriminator_epochs', default=5, type=int,
                        help="Discriminator epochs per GAN iteration")
    parser.add_argument('--generator_epochs', default=50, type=int,
                        help="ATP epochs per GAN iteration")
    parser.add_argument('--real_trajs', default=100, type=int,
                        help="Set max amount of real TRAJECTORIES used")
    parser.add_argument('--sim_trajs', default=100, type=int,
                        help="Set max amount of sim TRAJECTORIES used")
    parser.add_argument('--real_trans', default=5000, type=int,
                        help="amount of real world transitions used")
    parser.add_argument('--gsim_trans', default=5000, type=int,
                        help="amount of simulator transitions used")
    parser.add_argument('--debug', action='store_true', help="DEPRECATED")
    parser.add_argument('--eval', action='store_true',
                        help="set to true to evaluate the agent policy in the real environment, after training in grounded environment")
    parser.add_argument('--use_cuda', action='store_true',
                        help="DEPRECATED. Not using CUDA")
    parser.add_argument('--instance_noise', action='store_true',
                        help="DEPRECATED. Not using instance noise")
    parser.add_argument('--ent_coeff', default=0.00005, type=float,
                        help="entropy coefficient for the PPO algorithm, used to train the action transformer policy")
    parser.add_argument('--max_kl', default=0.000005, type=float,
                        help="Set this only if using TRPO for the action transformer policy")
    parser.add_argument('--clip_range', default=0.1, type=float,
                        help="PPO objective clipping factor -> Action transformer policy")
    parser.add_argument('--use_condor', action='store_true', help="UNUSABLE")
    parser.add_argument('--plot', action='store_true',
                        help="visualize the action transformer policy - works well only for simple environments")
    parser.add_argument('--tensorboard', action='store_true',
                        help="visualize training in tensorboard")
    parser.add_argument('--save_atp', action='store_true',
                        help="Saves the action transformer policy")
    parser.add_argument('--save_target_policy', action='store_true',
                        help="saves the agent policy")
    parser.add_argument('--debug_discriminator', action='store_true',
                        help="UNUSED")
    parser.add_argument('--use_eval_callback', action='store_true',
                        help="UNUSED")
    parser.add_argument('--loss_function', default="GAIL", type=str,
                        help="choose from the list: ['GAIL', 'WGAN', 'AIRL', 'FAIRL']")
    parser.add_argument('--reset_disc_only', action='store_true',
                        help="UNUSED")
    parser.add_argument('--namespace', default="wed_night", type=str,
                        help="namespace for the experiments")
    parser.add_argument('--dont_reset', action='store_true', help="UNUSED")
    parser.add_argument('--reset_target_policy', action='store_true',
                        help="UNUSED")
    parser.add_argument('--randomize_target_policy', action='store_true',
                        help="UNUSED")
    parser.add_argument('--compute_grad_penalty', action='store_true',
                        help="set this to true to compute the GP term while training the discriminator")
    parser.add_argument('--single_batch_test', action='store_true',
                        help="performs a single update of the generator and discriminator.")
    parser.add_argument('--folder_namespace', default="None", type=str,
                        help="UNUSED")
    parser.add_argument('--disc_lr', default=3e-3, type=float,
                        help="learning rate for the AdamW optimizer to update the discriminator")
    parser.add_argument('--atp_lr', default=3e-4, type=float,
                        help="learning rate for the Adam optimizer to update the agent policy")
    parser.add_argument('--nminibatches', default=4, type=int,
                        help="Number of minibatches used by the PPO algorithm to update the action transformer policy")
    parser.add_argument('--noptepochs', default=4, type=int,
                        help="Number of optimization epochs performed per minibatch by the PPO algorithm to update the action transformer policy")
    parser.add_argument('--deterministic', default=0, type=int,
                        help="set to 1 to use the deterministic action transformer policy in the grounded environment")
    parser.add_argument('--single_batch_size', default=0, type=int,
                        help="batch size for the GARAT update")

    args = parser.parse_args()

    # set the seeds here for experiments
    random.seed(args.expt_number)
    np.random.seed(args.expt_number)
    torch.manual_seed(args.expt_number)

    # if args.wgan: args.loss_function = 'WGAN'

    # make dummy gym environment
    dummy_env = gym.make(args.real_env)

    if args.dont_reset is True and args.reset_disc_only is True:
        raise ValueError(
            'Cannot have both args dont_reset and reset_disc_only. Choose one.'
        )

    expt_type = 'sim2sim' if args.sim_env == args.real_env else 'sim2real'
    expt_label = (args.namespace + args.loss_function + '_' + expt_type + '_'
                  + args.target_policy_algo + '_'
                  + str(args.n_trainsteps_target_policy) + '_'
                  + str(args.real_trans) + '_' + str(args.n_iters_atp) + '_'
                  + str(args.expt_number))

    # create the experiment folder
    if args.use_condor:
        if args.folder_namespace == "None":
            expt_path = '/u/' + args.real_env + '/' + expt_label
        else:
            expt_path = '/u/' + args.folder_namespace + '/' + expt_label
    else:
        expt_path = 'data/models/garat/' + expt_label

    expt_already_running = False

    gatworld = ReinforcedGAT(
        load_policy=args.load_policy_path,
        num_cores=args.num_cores,
        sim_env_name=args.sim_env,
        real_env_name=args.real_env,
        expt_label=expt_label,
        frames=args.n_frames,
        algo=args.target_policy_algo,
        atp_algo=args.action_tf_policy_algo,
        debug=args.debug,
        real_trajs=args.real_trajs,
        sim_trajs=args.sim_trajs,
        use_cuda=args.use_cuda,
        real_trans=args.real_trans,
        gsim_trans=args.gsim_trans,
        expt_path=expt_path,
        tensorboard=args.tensorboard,
        atp_loss_function=args.loss_function,
        single_batch_size=None if args.single_batch_size == 0
        else args.single_batch_size,
    )

    # checkpointing logic ~~ necessary when deploying script on Condor cluster
    if os.path.exists(expt_path):
        print('~~ Resuming from checkpoint ~~')
        # remove the best_model.zip file if it exists
        if os.path.exists(expt_path + '/best_model.zip'):
            os.remove(expt_path + '/best_model.zip')
        expt_already_running = True
        grounding_step = len(glob.glob(expt_path + '/*.pkl'))
        print('found ', grounding_step, ' target policies on disk')
        if grounding_step == args.n_grounding_steps:
            # training has ended
            raise ValueError('Rerunning same experiment again ! Exiting')
        else:
            if grounding_step > 0:
                print('reloading weights of the target policy')
                gatworld.load_model(expt_path + '/target_policy_'
                                    + str(grounding_step - 1) + '.pkl')
    else:
        print('First time running experiment')
        os.makedirs(expt_path)
        grounding_step = 0
        with open(expt_path + '/commandline_args.txt', 'w') as f:
            f.write('\n'.join(sys.argv[1:]))

    start_grounding_step = grounding_step

    if args.reset_disc_only or args.dont_reset:
        cprint('~~ INITIALIZING DISCRIMINATOR AND ATP POLICY ~~', 'yellow')
        gatworld._init_rgat_models(
            algo=args.action_tf_policy_algo,
            ent_coeff=args.ent_coeff,
            max_kl=args.max_kl,
            clip_range=args.clip_range,
            atp_loss_function=args.loss_function,
            disc_lr=args.disc_lr,
            atp_lr=args.atp_lr,
            nminibatches=args.nminibatches,
            noptepochs=args.noptepochs,
        )

    for _ in range(args.n_grounding_steps - start_grounding_step):
        grounding_step += 1
        gatworld.collect_experience_from_real_env()

        cprint('~~ RESETTING DISCRIMINATOR AND ATP POLICY ~~', 'yellow')
        gatworld._init_rgat_models(
            algo=args.action_tf_policy_algo,
            ent_coeff=args.ent_coeff,
            max_kl=args.max_kl,
            clip_range=args.clip_range,
            atp_loss_function=args.loss_function,
            disc_lr=args.disc_lr,
            atp_lr=args.atp_lr,
            nminibatches=args.nminibatches,
            noptepochs=args.noptepochs,
        )

        # ground the environment
        for ii in range(args.n_iters_atp):
            print('################### GROUNDING INNER ITERATION : ', ii,
                  ' ###################')
            for _ in range(args.discriminator_epochs):
                gatworld.train_discriminator(
                    iter_step=ii,
                    grounding_step=grounding_step,
                    num_epochs=args.noptepochs * 5 if ii <= 10
                    else args.noptepochs,  # warmup
                    inject_instance_noise=args.instance_noise,
                    compute_grad_penalty=args.compute_grad_penalty,
                    nminibatches=args.nminibatches,
                    single_batch_test=args.single_batch_test,
                    debug_discriminator=args.debug_discriminator,
                )
            gatworld.train_action_transformer_policy(
                beta=args.beta,
                num_epochs=args.generator_epochs,
                loss_function=args.loss_function,
                single_batch_test=args.single_batch_test,
            )

            # test grounded environment
            if args.plot and dummy_env.action_space.shape[0] < 5:
                # action transformer plot
                gatworld.test_grounded_environment(
                    alpha=args.alpha,
                    grounding_step=str(grounding_step) + '_' + str(ii),
                )
            else:
                print('Environment has action space >= 5. Skipping AT plotting')

            if args.save_atp:
                # save the action transformer policy for further analysis
                gatworld.save_atp(grounding_step=str(grounding_step) + '_' + str(ii))
                # gatworld.save_grounded_env(grounding_step=str(grounding_step) + '_' + str(ii))

        # if args.randomize_target_policy:
        #     gatworld._randomize_target_policy(algo=args.target_policy_algo)

        gatworld.train_target_policy_in_grounded_env(
            grounding_step=grounding_step,
            alpha=args.alpha,
            time_steps=args.n_trainsteps_target_policy,
            use_eval_callback=args.use_eval_callback,
            save_model=args.save_target_policy,
            use_deterministic=True if args.deterministic == 1 else False,
        )

        if args.eval:
            cprint('Evaluating target policy in environment .. ', 'red',
                   'on_blue')
            test_env = gym.make(args.real_env)
            if 'mujoco_norm' in args.load_policy_path:
                test_env = MujocoNormalized(test_env)
            elif 'normalized' in args.load_policy_path:
                test_env = DummyVecEnv([lambda: test_env])
                test_env = VecNormalize.load(
                    'data/models/env_stats/' + args.sim_env + '.pkl',
                    venv=test_env)

            # evaluate on the real world.
            try:
                val = evaluate_policy_on_env(test_env,
                                             gatworld.target_policy,
                                             render=False,
                                             iters=20,
                                             deterministic=True)
                with open(expt_path + "/output.txt", "a") as txt_file:
                    print(val, file=txt_file)

                val = evaluate_policy_on_env(test_env,
                                             gatworld.target_policy,
                                             render=False,
                                             iters=20,
                                             deterministic=False)
                with open(expt_path + "/stochastic_output.txt", "a") as txt_file:
                    print(val, file=txt_file)
            except Exception as e:
                cprint(e, 'red')

    # expt done, now get the green and red lines
    if args.eval:
        # green line
        cprint('**~~vv^^ GETTING GREEN AND RED LINES ^^vv~~**', 'red',
               'on_green')
        test_env = gym.make(args.real_env)
        if 'mujoco_norm' in args.load_policy_path:
            test_env = MujocoNormalized(test_env)
        elif 'normalized' in args.load_policy_path:
            test_env = DummyVecEnv([lambda: test_env])
            test_env = VecNormalize.load(
                'data/models/env_stats/' + args.sim_env + '.pkl',
                venv=test_env)

        sim_policy = ('data/models/' + args.target_policy_algo
                      + '_initial_policy_steps_' + args.sim_env
                      + '_1000000_.pkl')
        real_policy = ('data/models/' + args.target_policy_algo
                       + '_initial_policy_steps_' + args.real_env
                       + '_1000000_.pkl')

        if 'HalfCheetah' in args.load_policy_path or 'Reacher' in args.load_policy_path:
            sim_policy = sim_policy.replace('1000000_.pkl', '2000000_.pkl')
            real_policy = real_policy.replace('1000000_.pkl', '2000000_.pkl')

        # if 'Walker2d' in args.load_policy_path:
        #     sim_policy = sim_policy.replace('1000000_.pkl', '2000000_mujoco_norm_.pkl')
        #     real_policy = real_policy.replace('1000000_.pkl', '2000000_mujoco_norm_.pkl')

        if 'mujoco_norm' in args.load_policy_path:
            sim_policy = sim_policy.replace('1000000_.pkl',
                                            '2000000_mujoco_norm_.pkl')
            real_policy = real_policy.replace('1000000_.pkl',
                                              '2000000_mujoco_norm_.pkl')
        elif 'normalized' in args.load_policy_path:
            sim_policy = sim_policy.replace('1000000_.pkl',
                                            '1000000_normalized_.pkl')
            real_policy = real_policy.replace('1000000_.pkl',
                                              '1000000_normalized_.pkl')

        if args.target_policy_algo == 'PPO2':
            algo = PPO2
        elif args.target_policy_algo == 'TRPO':
            algo = TRPO

        val = evaluate_policy_on_env(test_env,
                                     algo.load(sim_policy),
                                     render=False,
                                     iters=10,
                                     deterministic=True)
        with open(expt_path + "/green_red.txt", "a") as txt_file:
            print(val, file=txt_file)

        # red line
        del algo  # remove the old algo and reload it.
        if args.target_policy_algo == 'PPO2':
            algo = PPO2
        elif args.target_policy_algo == 'TRPO':
            algo = TRPO

        val = evaluate_policy_on_env(test_env,
                                     algo.load(real_policy),
                                     render=False,
                                     iters=10,
                                     deterministic=True)
        with open(expt_path + "/green_red.txt", "a") as txt_file:
            print(val, file=txt_file)

    os._exit(0)
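`evaluate_policy_on_env` is not defined in this excerpt. The following is a minimal sketch of such a helper, under the assumption that it rolls out a fixed number of episodes and returns episode-return statistics; it is not the repository's actual implementation:

import numpy as np

def evaluate_policy_on_env(env, model, render=False, iters=10,
                           deterministic=True):
    # Sketch only: accumulates episode returns over `iters` episodes and
    # returns (mean, std, n_episodes). Works for a plain gym env (bool done)
    # and, via numpy truthiness, for a single-env VecEnv (1-element array).
    returns = []
    for _ in range(iters):
        obs = env.reset()
        done, ep_ret = False, 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=deterministic)
            obs, reward, done, _info = env.step(action)
            ep_ret += float(np.sum(reward))
            if render:
                env.render()
        returns.append(ep_ret)
    return float(np.mean(returns)), float(np.std(returns)), len(returns)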