import os
import inspect

import gym
from gym.envs.registration import load
from stable_baselines import logger
from stable_baselines.bench import Monitor
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import (DummyVecEnv, SubprocVecEnv,
                                             VecFrameStack, VecNormalize)

# make_env and get_wrapper_class are helper functions defined elsewhere in this module.


def create_test_env(env_id, n_envs=1, is_atari=False, stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For PyBullet envs, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack); may also contain
        an 'env_wrapper' entry (a gym.Wrapper subclass to wrap the original env with)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    # Guard against the default value: the code below indexes into hyperparams
    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
                             for i in range(n_envs)])
    # PyBullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        try:
            class_ = load(spec.entry_point)
        except AttributeError:
            # Backward compatibility with gym
            class_ = load(spec._entry_point)

        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        render_name = None
        use_subproc = 'renders' not in inspect.getfullargspec(class_.__init__).args
        if not use_subproc:
            render_name = 'renders'
        # Dev branch of pybullet
        # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args
        # if not use_subproc and render_name is None:
        #     render_name = 'render'

        # Create the env, with the original kwargs, and the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, **{render_name: should_render})
            env.seed(seed)
            if log_dir is not None:
                env = Monitor(env, os.path.join(log_dir, "0"), allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])
            env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
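# Minimal usage sketch (an illustration, not from the source): the env id and
# stats path below are hypothetical. When stats_path is given, hyperparams must
# contain 'normalize' and 'normalize_kwargs' keys, as the function indexes them:
env = create_test_env("HalfCheetahBulletEnv-v0",
                      n_envs=1,
                      stats_path="logs/ppo2/HalfCheetahBulletEnv-v0_1",  # hypothetical path
                      seed=0,
                      log_dir=None,
                      should_render=False,
                      hyperparams={"normalize": True,
                                   "normalize_kwargs": {"norm_reward": False}})
obs = env.reset()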
import os

import gym
import numpy as np
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

# env, actor_options, learning_options and tensorboard_logdir are assumed to be
# defined earlier in the script; env is a VecNormalize-wrapped training env.
try:
    actor_options['tensorboard_log'] = os.path.join(
        tensorboard_logdir, 'RemoteCenterPegInsertionNoHole-v0')
    print('\t--learning')
    model = PPO2(MlpPolicy, env, **actor_options)
    model.learn(**learning_options)
    save_path = os.path.join(actor_options['tensorboard_log'], 'model')
    running_average_path = actor_options['tensorboard_log']
    model.save(save_path)
    env.save_running_average(running_average_path)
finally:
    env.close()

# Visualize the solution
env = gym.make('RemoteCenterPegInsertionNoHole-v0')
env = DummyVecEnv([lambda: env])
env = VecNormalize(env, training=False, norm_reward=False,
                   clip_obs=np.inf, clip_reward=np.inf)
env.load_running_average(running_average_path)

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    # Clip actions to the valid range before stepping the env
    clipped_action = np.clip(action, env.action_space.low, env.action_space.high)
    obs, rewards, dones, info = env.step(clipped_action)
    env.render()
    if dones[0]:
        env.reset()
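# The visualization above reuses the in-memory `model` from training. In a
# fresh process, both the policy and the normalization statistics would have to
# be reloaded first; a minimal sketch, assuming the same save paths as above:
model = PPO2.load(save_path)
env = gym.make('RemoteCenterPegInsertionNoHole-v0')
env = DummyVecEnv([lambda: env])
env = VecNormalize(env, training=False, norm_reward=False)
env.load_running_average(running_average_path)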
import numpy as np
import tensorflow as tf
from stable_baselines.common.vec_env import SubprocVecEnv, VecNormalize

# make_env and print_results are helper functions defined elsewhere in this module.


def evaluate_model_on_set(
    set_path,
    model,
    config_path=None,
    config_kw=None,
    metrics=("success", "control_variation", "rise_time", "overshoot", "settling_time"),
    norm_data_path=None,
    num_envs=1,
    turbulence_intensity="none",
    use_pid=False,
    writer=None,
    timestep=None,
):
    """
    :param set_path: (str) path to test set file
    :param model: (PPO2 object or [PIDController]) the controller to be evaluated
    :param config_path: (str) path to gym environment configuration file
    :param config_kw: (dict) dictionary of key-value pairs to override settings in the
        configuration file of the gym environment
    :param metrics: ([str]) list of metrics to be computed and recorded
    :param norm_data_path: (str) path to folder containing normalization statistics
    :param num_envs: (int) number of gym environments to run in parallel using multiprocessing
    :param turbulence_intensity: (str) the intensity setting of the wind turbulence
    :param use_pid: (bool) whether the evaluated controller is a PID controller or not
    :param writer: (tensorboard writer) if supplied, evaluation results are written to the
        tensorboard log; if not, results are printed to standard output
    :param timestep: (int) what timestep results are written to when using tensorboard logging
    :return: (dict) the metrics computed for the evaluated controller on the test set
    """
    scenarios = list(np.load(set_path, allow_pickle=True))
    scenario_count = len(scenarios)

    if config_kw is None:
        config_kw = {}

    config_kw.update({
        "steps_max": 1500,
        "target": {
            "on_success": "done",
            "success_streak_fraction": 1,
            "success_streak_req": 100,
            "states": {
                0: {"bound": 5},
                1: {"bound": 5},
                2: {"bound": 2},
            },
        },
    })

    if use_pid:
        config_kw["action"] = {"scale_space": False}

    sim_config_kw = {
        # Compare against the lowercase default ("none") used by the parameter
        "turbulence": turbulence_intensity != "none",
        "turbulence_intensity": turbulence_intensity,
    }

    test_env = SubprocVecEnv([
        make_env(config_path, i, config_kw=config_kw, sim_config_kw=sim_config_kw)
        for i in range(num_envs)
    ])

    if use_pid:
        dt = test_env.get_attr("simulator")[0].dt
        for pid in model:
            pid.dt = dt
        env_cfg = test_env.get_attr("cfg")[0]
        obs_states = [var["name"] for var in env_cfg["observation"]["states"]]
        try:
            phi_i, theta_i, Va_i = (
                obs_states.index("roll"),
                obs_states.index("pitch"),
                obs_states.index("Va"),
            )
            omega_i = [
                obs_states.index("omega_p"),
                obs_states.index("omega_q"),
                obs_states.index("omega_r"),
            ]
        except ValueError:
            print("When using PID, roll, pitch, Va, omega_p, omega_q, omega_r "
                  "must be part of the observation vector.")
            raise
    else:
        test_env = VecNormalize(test_env)
        if model.env is not None:
            test_env.obs_rms = model.env.obs_rms
            test_env.ret_rms = model.env.ret_rms
        else:
            assert norm_data_path is not None
            test_env.load_running_average(norm_data_path)
        test_env.training = False

    res = {metric: {} for metric in metrics}
    res["rewards"] = [[] for i in range(scenario_count)]
    active_envs = [i < scenario_count for i in range(num_envs)]
    env_scen_i = [i for i in range(num_envs)]
    test_done = False
    obs = np.array([np.zeros(test_env.observation_space.shape) for i in range(num_envs)])
    done = [True for i in range(num_envs)]
    info = None

    while not test_done:
        for i, env_done in enumerate(done):
            if env_done:
                if len(scenarios) > 0 or active_envs[i]:
                    if len(scenarios) > 0:
                        print("{}/{} scenarios left".format(len(scenarios), scenario_count))
                        scenario = scenarios.pop(0)
                        env_scen_i[i] = (scenario_count - 1) - len(scenarios)
                        obs[i] = test_env.env_method("reset", indices=i, **scenario)[0]
                        if use_pid:
                            model[i].reset()
                            model[i].set_reference(
                                scenario["target"]["roll"],
                                scenario["target"]["pitch"],
                                scenario["target"]["Va"],
                            )
                    else:
                        active_envs[i] = False
                    if info is not None:
                        for metric in metrics:
                            if isinstance(info[i][metric], dict):
                                for state, value in info[i][metric].items():
                                    if state not in res[metric]:
                                        res[metric][state] = []
                                    res[metric][state].append(value)
                            else:
                                if "all" not in res[metric]:
                                    res[metric]["all"] = []
                                res[metric]["all"].append(info[i][metric])
        if len(scenarios) == 0:
            test_done = not any(active_envs)

        if use_pid:
            actions = []
            for i, pid in enumerate(model):
                roll, pitch, Va = obs[i, phi_i], obs[i, theta_i], obs[i, Va_i]
                omega = obs[i, omega_i]
                if info is not None and "target" in info[i]:
                    pid.set_reference(
                        phi=info[i]["target"]["roll"],
                        theta=info[i]["target"]["pitch"],
                        va=info[i]["target"]["Va"],
                    )
                actions.append(pid.get_action(roll, pitch, Va, omega))
            actions = np.array(actions)
        else:
            actions, _ = model.predict(obs, deterministic=True)

        obs, rew, done, info = test_env.step(actions)
        for i, env_rew in enumerate(rew):
            res["rewards"][env_scen_i[i]].append(env_rew)

    if writer is not None:
        summaries = []
        for metric, metric_v in res.items():
            if isinstance(res[metric], dict):
                for state, v in res[metric].items():
                    summaries.append(tf.Summary.Value(
                        tag="test_set/{}_{}".format(metric, state),
                        simple_value=np.nanmean(v),
                    ))
        writer.add_summary(tf.Summary(value=summaries), timestep)
    else:
        print_results(res)

    return res
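# Hedged usage sketch (the file names and paths below are illustrative, not
# from the source). A model loaded without an env has model.env set to None,
# so the normalization statistics must be supplied via norm_data_path:
from stable_baselines import PPO2

model = PPO2.load("models/ppo2_attitude_controller")  # hypothetical checkpoint
results = evaluate_model_on_set(
    "test_sets/test_set_wind_moderate.npy",        # hypothetical test-set file
    model,
    config_path="configs/fixed_wing_config.json",  # hypothetical config file
    norm_data_path="models/",                      # hypothetical stats folder
    num_envs=4,
    turbulence_intensity="moderate",
)
print({metric: {k: np.nanmean(v) for k, v in vals.items()}
       for metric, vals in results.items() if isinstance(vals, dict)})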
import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

# env and env_id are assumed to be defined earlier in the script; CustomPolicy
# is a user-defined policy class (a placeholder sketch follows below).
model = PPO2(CustomPolicy, env, n_steps=int(2048 / 128), nminibatches=64,
             noptepochs=10, lam=0.98, verbose=1,
             tensorboard_log='/home/xi/model/log')
# model = PPO2.load("ppo2_ipadgame")
# model.set_env(env)
# model.tensorboard_log = '/home/xi/model/log'
# env.load_running_average("/home/xi/model/")

model.learn(total_timesteps=50000)
# Save the policy and normalization stats; the evaluation below reloads both
model.save("ppo2_ipadgame")
env.save_running_average("/home/xi/model/")
# print('done')

env = gym.make(env_id)
env = DummyVecEnv([lambda: env])
env = VecNormalize(env, training=False, norm_reward=False)
obs = env.reset()
model = PPO2.load("ppo2_ipadgame")
env.load_running_average("/home/xi/model/")

for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
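# The script above references a CustomPolicy that is not shown. A minimal
# placeholder, assuming a standard stable-baselines FeedForwardPolicy subclass;
# the network architecture below is an illustrative guess, not the original:
from stable_baselines.common.policies import FeedForwardPolicy

class CustomPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        # Two hidden layers of 64 units each for both policy and value networks
        super(CustomPolicy, self).__init__(*args, **kwargs,
                                           net_arch=[dict(pi=[64, 64], vf=[64, 64])],
                                           feature_extraction="mlp")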