def evaluate_policy(e,
                    policy,
                    learned_model,
                    noise_level=0.0,
                    real_step=False,
                    num_episodes=10,
                    visualize=False):
    # roll out the policy (on the real env or the learned model) and record performance
    paths = []
    for ep in range(num_episodes):
        e.reset()
        observations = []
        actions = []
        rewards = []
        env_infos = []
        t = 0
        done = False
        while t < e.horizon and not done:
            o = e.get_obs()
            ifo = e.get_env_infos()
            a = policy.get_action(o)
            if isinstance(a, list):
                a = a[1]['evaluation']
            if noise_level > 0.0:
                a = a + e.env.env.np_random.uniform(low=-noise_level,
                                                    high=noise_level,
                                                    size=a.shape[0])
            if real_step is False:
                next_s = learned_model.predict(o, a)
                r = 0.0  # placeholder; rewards are recomputed from the full path below
                e.env.env.set_fitted_state(next_s)
            else:
                next_o, r, done, ifo2 = e.step(a)
                ifo = ifo2 if ifo == {} else ifo
            if visualize:
                e.render()
            t = t + 1
            observations.append(o)
            actions.append(a)
            rewards.append(r)
            env_infos.append(ifo)
        path = dict(observations=np.array(observations),
                    actions=np.array(actions),
                    rewards=np.array(rewards),
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos))
        if real_step is False:
            e.env.env.compute_path_rewards(path)
            try:
                path = e.env.env.truncate_paths([path])[0]
            except Exception:
                # env does not support path truncation; keep the full path
                pass
        paths.append(path)
        if visualize:
            print("episode score = %f" % np.sum(path['rewards']))
    return paths
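# Hedged usage sketch (not part of the original code): summarize the paths
# returned by evaluate_policy. It relies only on the path dict layout built
# above, where per-step rewards live under the 'rewards' key.
def mean_episode_score(paths):
    """Average total reward across a list of rollout paths."""
    return np.mean([np.sum(p['rewards']) for p in paths])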
def sample_paths(num_traj,
                 env,
                 policy,  # mpc policy on fitted model
                 horizon=1e6,
                 eval_mode=True,
                 base_seed=None,
                 noise_level=0.1,
                 ):
    # get the correct env behavior
    if isinstance(env, str):
        env = GymEnv(env)
    elif isinstance(env, GymEnv):
        pass
    elif callable(env):
        env = env()
    else:
        raise AttributeError("Unsupported environment format")
    if base_seed is not None:
        env.set_seed(base_seed)
    horizon = min(horizon, env.horizon)
    paths = []
    for ep in range(num_traj):
        env.reset()
        observations = []
        actions = []
        rewards = []
        env_infos = []
        t = 0
        done = False
        while t < horizon and not done:
            obs = env.get_obs()
            ifo = env.get_env_infos()
            act = policy.get_action(obs)
            # add exploration noise only to plain (non-list) actions
            if eval_mode is False and not isinstance(act, list):
                act = act + np.random.uniform(low=-noise_level,
                                              high=noise_level,
                                              size=act.shape[0])
            if isinstance(act, list):
                act = act[0] if eval_mode is False else act[1]['evaluation']
            next_obs, reward, done, _ = env.step(act)
            t = t + 1
            observations.append(obs)
            actions.append(act)
            rewards.append(reward)
            env_infos.append(ifo)
        path = dict(observations=np.array(observations),
                    actions=np.array(actions),
                    rewards=np.array(rewards),
                    terminated=done,
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos))
        paths.append(path)
    return paths
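# Hedged usage sketch: sample_paths accepts an env id string and resolves it
# through GymEnv internally. The helper name and argument values below are
# illustrative assumptions, not part of the original API.
def collect_exploration_data(policy, env_id, n_traj=10, seed=123):
    """Gather noisy rollouts (eval_mode=False) suitable for model fitting."""
    return sample_paths(num_traj=n_traj, env=env_id, policy=policy,
                        eval_mode=False, base_seed=seed, noise_level=0.1)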
def do_rollout(num_traj,
               env,
               policy,
               eval_mode=False,
               horizon=1e6,
               base_seed=None,
               env_kwargs=None,
               init_states_per_cpu=None):
    """
    :param num_traj:    number of trajectories (int)
    :param env:         environment (env class, str with env_name, or factory function)
    :param policy:      policy to use for action selection
    :param eval_mode:   use evaluation mode for action computation (bool)
    :param horizon:     max horizon length for rollout (<= env.horizon)
    :param base_seed:   base seed for rollouts (int)
    :param env_kwargs:  dictionary with parameters, will be passed to env generator
    :param init_states_per_cpu: list of init states to initialize from for fixed evaluation
    :return:            list of path dictionaries, one per trajectory
    """
    # get the correct env behavior
    if isinstance(env, str):
        if isinstance(env_kwargs, dict):
            env = GymEnv(env, **env_kwargs)
        else:
            env = GymEnv(env)
    elif isinstance(env, GymEnv):
        pass
    elif callable(env):
        # guard against env_kwargs=None when env is a factory function
        env = env(**env_kwargs) if isinstance(env_kwargs, dict) else env()
    else:
        raise AttributeError("Unsupported environment format")

    if base_seed is not None:
        env.set_seed(base_seed)
        np.random.seed(base_seed)
    else:
        np.random.seed()
    horizon = min(horizon, env.horizon)
    paths = []

    for ep in range(num_traj):
        # seeding
        if base_seed is not None:
            seed = base_seed + ep
            env.set_seed(seed)
            np.random.seed(seed)

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []

        o = env.reset()
        if init_states_per_cpu is not None:
            o = env.set_env_state(init_states_per_cpu[ep])
            assert o is not None, 'set_env_state of env ' + env.env_id + \
                ' returned None; it should return an observation'
        done = False
        t = 0

        while t < horizon and not done:
            a, agent_info = policy.get_action(o)
            if eval_mode:
                a = agent_info['evaluation']
            env_info_base = env.get_env_infos()
            next_o, r, done, env_info_step = env.step(a)
            # below is important to ensure correct env_infos for the timestep
            env_info = env_info_step if env_info_base == {} else env_info_base
            observations.append(o)
            actions.append(a)
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            o = next_o
            t += 1

        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            terminated=done)
        paths.append(path)

    del env
    return paths
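# Hedged sketch of the fixed-evaluation pattern do_rollout supports: passing
# one saved state per trajectory makes repeated evaluations start from
# identical conditions. `saved_states` is hypothetical; each entry must be
# accepted by env.set_env_state.
def evaluate_from_states(policy, env_id, saved_states, seed=0):
    """Deterministic evaluation rollouts, one per saved initial state."""
    return do_rollout(num_traj=len(saved_states), env=env_id, policy=policy,
                      eval_mode=True, base_seed=seed,
                      init_states_per_cpu=saved_states)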
def main(include, env_name, path_file, loop, render, save):
    # get env
    if include != "":
        exec("import " + include)  # import module that registers custom envs
    env = GymEnv(env_name)

    # load paths
    paths = pickle.load(open(path_file, 'rb'))

    # playback paths
    pbk_paths = []
    for i_loop in range(loop):
        for path in paths:
            obs = []
            act = []
            rewards = []
            env_infos = []
            states = []

            # initialize paths
            if "state" in path.keys():
                env.reset(init_state=path["state"][0])
            else:
                env.reset()

            # playback input path
            if render:
                env.env.env.mujoco_render_frames = True
            o = env.get_obs()
            for i_step in range(path['actions'].shape[0]):
                a = path['actions'][i_step]
                onext, r, d, info = env.step(a)
                if render:
                    env.render()
                obs.append(o)
                o = onext
                act.append(a)
                rewards.append(r)
                env_infos.append(info)
                states.append(env.get_env_state())
            if render:
                env.env.env.mujoco_render_frames = False  # turn frames back off after playback

            # create output paths
            pbk_path = dict(observations=np.array(obs),
                            actions=np.array(act),
                            rewards=np.array(rewards),
                            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
                            states=states)
            pbk_paths.append(pbk_path)
        print("Finished playback loop: {}".format(i_loop))

    # save output paths
    if save:
        pbk_file_name = path_file[:path_file.rfind('.')] + "_playback.pickle"
        pickle.dump(pbk_paths, open(pbk_file_name, 'wb'))
        print("Saved: " + pbk_file_name)

    return pbk_paths
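# Hedged sketch: a quick fidelity check for the playback above. Assumes both
# the logged path and its playback store per-step rewards under 'rewards',
# matching the dicts constructed in main.
def playback_reward_gap(original_path, pbk_path):
    """Absolute difference in total reward between a logged path and its playback."""
    return abs(np.sum(original_path['rewards']) - np.sum(pbk_path['rewards']))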
def do_evaluation_rollout(N,
                          policy,
                          T=1e6,
                          env=None,
                          env_name=None,
                          pegasus_seed=None):
    """
    params:
    N            : number of trajectories
    policy       : policy to be used to sample the data
    T            : maximum length of trajectory
    env          : env object to sample from
    env_name     : name of env to be sampled from (one of env or env_name must be specified)
    pegasus_seed : seed for environment (numpy seed must be set externally)
    """
    if env_name is None and env is None:
        print("No environment specified! Error will be raised")
    if env is None:
        env = get_environment(env_name)
    if pegasus_seed is not None:
        try:
            env.env._seed(pegasus_seed)
        except AttributeError:
            env.env.seed(pegasus_seed)
    T = min(T, env.horizon)

    paths = []
    for ep in range(N):
        # set pegasus seed if asked
        if pegasus_seed is not None:
            seed = pegasus_seed + ep
            try:
                env.env._seed(seed)
            except AttributeError:
                env.env.seed(seed)
            np.random.seed(seed)
        else:
            np.random.seed()

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []

        o = env.reset()
        done = False
        t = 0
        while t < T and not done:
            _, agent_info = policy.get_action(o)
            a = agent_info['evaluation']
            next_o, r, done, env_info = env.step(a)
            observations.append(o)
            actions.append(a)
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            o = next_o
            t += 1

        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            terminated=done)
        paths.append(path)

    return paths
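# Hedged usage sketch: evaluation rollouts with deterministic per-episode
# seeding. The helper name and the env id are illustrative assumptions.
def score_policy(policy, env_name, n_traj=10, seed=123):
    """Mean total reward of `policy` over n_traj seeded evaluation rollouts."""
    paths = do_evaluation_rollout(N=n_traj, policy=policy,
                                  env_name=env_name, pegasus_seed=seed)
    return np.mean([np.sum(p['rewards']) for p in paths])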
def trajectory_generator(self, beta, dagger_ep):
    if self.env_name is None and self.env is None:
        print("No environment specified! Error will be raised")
    if self.env is None:
        self.env = get_environment(self.env_name)
    T = self.env.horizon
    paths = []
    print('Generating trajectories')
    for ep in tqdm(range(self.num_traj_gen)):
        # per-episode seeding, offset by the DAgger iteration
        if self.seed is not None:
            seed = self.seed + ep + dagger_ep * self.num_traj_gen
            self.env.env.env._seed(seed)
            np.random.seed(seed)
        else:
            np.random.seed()

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []
        path_image_pixels = []
        all_robot_info = []

        if self.has_robot_info:
            o, env_info = self.env.reset()
            robot_info = env_info['robot_info']
        else:
            o = self.env.reset()
            robot_info = None
        done = False
        t = 0
        while t < T and not done:
            # coin flip for the expert/learner mixture; kept in its own
            # variable so the step reward below does not shadow it
            mix_draw = np.random.random()
            image_pix = self.env.get_pixels(frame_size=FRAME_SIZE,
                                            camera_name=self.camera_name,
                                            device_id=self.device_id)
            a_expert, agent_info_expert = self.expert_policy.get_action(o)

            # stack the current frame with the two previous frames
            img = image_pix
            prev_img = image_pix
            prev_prev_img = image_pix
            if t > 0:
                prev_img = path_image_pixels[t - 1]
            if t > 1:
                prev_prev_img = path_image_pixels[t - 2]
            prev_prev_img = np.expand_dims(prev_prev_img, axis=0)
            prev_img = np.expand_dims(prev_img, axis=0)
            img = np.expand_dims(img, axis=0)
            o_img = np.concatenate((prev_prev_img, prev_img, img), axis=0)

            a_viz, agent_info_viz = self.viz_policy.get_action(
                o_img,
                use_seq=self.use_seq,
                use_cuda=self.use_cuda,
                robot_info=robot_info)
            # execute the expert with probability beta, else the visual policy
            if mix_draw <= beta:
                a = agent_info_expert['evaluation']
                agent_info = agent_info_expert
            else:
                a = a_viz
                agent_info = agent_info_viz
            next_o, r, done, env_info = self.env.step(a)

            observations.append(o)
            # DAgger: always label the visited state with the expert action
            actions.append(agent_info_expert['evaluation'])
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            path_image_pixels.append(image_pix)
            if self.has_robot_info:
                all_robot_info.append(robot_info)
                robot_info = env_info['robot_info']
            o = next_o
            t += 1

        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            terminated=done,
            image_pixels=np.array(path_image_pixels))
        if self.has_robot_info:
            path['robot_info'] = np.array(all_robot_info)
        paths.append(path)

    return paths
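# Hedged sketch of a beta schedule for the expert/learner mixture used in
# trajectory_generator: beta is the probability of executing the expert action
# at a timestep. Exponential decay across DAgger iterations is a common choice,
# assumed here rather than mandated by the code above.
def dagger_beta(dagger_ep, decay=0.7):
    """Mixing coefficient for DAgger iteration dagger_ep (1.0 at iteration 0)."""
    return decay ** dagger_ep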