def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']
    # wrapped_env = gym.make('HalfCheetahHolePositions-v{}'.format(1))
    # disc = data['env'].disc
    # reward_params = data['env'].reward_params
    # unsupervised_reward_weight = data['env'].unsupervised_reward_weight
    # reward_weight = data['env'].reward_weight
    # env = DiscriminatorWrappedEnv(wrapped_env=wrapped_env,
    #                               disc=disc,
    #                               reward_params=reward_params,
    #                               unsupervised_reward_weight=unsupervised_reward_weight,
    #                               reward_weight=reward_weight)
    # env = data['env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True, 1)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        skill = np.random.randint(0, args.num_skills)
        path = rollout(env, policy,
                       max_path_length=args.H,
                       animated=True,
                       skill=skill,
                       deterministic=True)
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def pretrain(self):
    if (
        self.num_paths_for_normalization == 0
        or (self.obs_normalizer is None and self.action_normalizer is None)
    ):
        return

    pretrain_paths = []
    random_policy = RandomPolicy(self.env.action_space)
    while len(pretrain_paths) < self.num_paths_for_normalization:
        path = rollout(self.env, random_policy, self.max_path_length)
        pretrain_paths.append(path)
    ob_mean, ob_std, ac_mean, ac_std = (
        compute_normalization(pretrain_paths)
    )
    if self.obs_normalizer is not None:
        self.obs_normalizer.set_mean(ob_mean)
        self.obs_normalizer.set_std(ob_std)
        self.target_qf.obs_normalizer = self.obs_normalizer
        self.target_policy.obs_normalizer = self.obs_normalizer
    if self.action_normalizer is not None:
        self.action_normalizer.set_mean(ac_mean)
        self.action_normalizer.set_std(ac_std)
        self.target_qf.action_normalizer = self.action_normalizer
        self.target_policy.action_normalizer = self.action_normalizer
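# The `compute_normalization` helper used above is not shown in this file. Below is
# a minimal sketch of what it is assumed to do (per-dimension mean/std over the
# observations and actions of the collected pretrain paths); the real implementation
# may differ.
import numpy as np

def compute_normalization(paths):
    obs = np.vstack([path['observations'] for path in paths])
    acts = np.vstack([path['actions'] for path in paths])
    # A small epsilon keeps the std strictly positive so later division is safe.
    ob_mean, ob_std = obs.mean(axis=0), obs.std(axis=0) + 1e-6
    ac_mean, ac_std = acts.mean(axis=0), acts.std(axis=0) + 1e-6
    return ob_mean, ob_std, ac_mean, ac_std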
def obtain_samples(self,
                   deterministic=False,
                   max_samples=np.inf,
                   max_trajs=np.inf,
                   accum_context=True,
                   resample=1):
    """
    Obtain samples in the environment until we reach either max_samples
    transitions or max_trajs trajectories. The resample argument specifies
    how often (in trajectories) the agent resamples its context.
    """
    assert max_samples < np.inf or max_trajs < np.inf, \
        "either max_samples or max_trajs must be finite"
    policy = MakeDeterministic(self.policy) if deterministic else self.policy
    paths = []
    n_steps_total = 0
    n_trajs = 0
    while n_steps_total < max_samples and n_trajs < max_trajs:
        path = rollout(self.env, policy,
                       max_path_length=self.max_path_length,
                       accum_context=accum_context)
        # save the latent context that generated this trajectory
        path['context'] = policy.z.detach().cpu().numpy()
        paths.append(path)
        n_steps_total += len(path['observations'])
        n_trajs += 1
        # don't we also want the option to resample z every transition?
        if n_trajs % resample == 0:
            policy.sample_z()
    return paths, n_steps_total
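# A minimal usage sketch. The `sampler` object and the hyperparameter values below
# are assumptions for illustration only: collect up to ten trajectories, accumulating
# context and resampling the latent z every second trajectory.
paths, n_steps = sampler.obtain_samples(deterministic=False,
                                        max_trajs=10,
                                        accum_context=True,
                                        resample=2)
print(len(paths), n_steps)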
def _train(self, policy, accum_context):
    for i in range(self.num_train_steps_per_itr):
        path = rollout(self.env, policy,
                       max_path_length=self.max_path_length,
                       accum_context=accum_context)
        self.model.train(path)
def obtain_samples(self, rollout_type="multitask"):
    paths = []
    n_steps_total = 0
    while n_steps_total + self.max_path_length <= self.max_samples:
        if self.randomize_env:
            self.env, env_name = self.alg.get_new_env()
            print(f"Evaluating {env_name}")
        if rollout_type == "multitask":
            path = multitask_rollout(
                self.env,
                self.policy,
                max_path_length=self.max_path_length,
                animated=False,
                observation_key='observation',
                desired_goal_key='desired_goal',
                get_action_kwargs=dict(
                    return_stacked_softmax=False,
                    mask=np.ones((1, self.env.unwrapped.num_blocks)),
                    deterministic=True
                )
            )
        else:
            path = rollout(
                self.env,
                self.policy,
                max_path_length=self.max_path_length
            )
        paths.append(path)
        n_steps_total += len(path['observations'])
    return paths
def simulate_policy(args):
    # data = joblib.load(args.file)
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(gym.make("BipedalWalker-v2"))
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    import cv2
    video = cv2.VideoWriter('ppo_test.avi',
                            cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                            30, (640, 480))
    index = 0
    path = rollout(
        env,
        policy,
        max_path_length=args.H,
        render=True,
    )
    if hasattr(env, "log_diagnostics"):
        env.log_diagnostics([path])
    logger.dump_tabular()
    for i, img in enumerate(path['images']):
        print(i)
        video.write(img[:, :, ::-1].astype(np.uint8))
        cv2.imwrite("frames/ppo_test/%06d.png" % index, img[:, :, ::-1])
        index += 1
    video.release()
    print("wrote video")
def simulate_policy(args):
    # data = joblib.load(args.file)
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(Mani2dEnv())
    # env.reset()
    # print(env.step(env.action_space.sample()))
    # sys.exit()
    # env = env.wrapped_env.unwrapped
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        # policy.cuda()
    # import cv2
    # video = cv2.VideoWriter('diayn_bipedal_walker_hardcore.avi',
    #                         cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
    #                         30, (1200, 800))
    index = 0
    for skill in range(policy.stochastic_policy.skill_dim):
        print(skill)
        for _ in range(3):
            path = rollout(
                env,
                policy,
                skill,
                max_path_length=args.H,
                render=True,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()
def obtain_samples(self, deterministic=False, num_samples=None,
                   num_rollouts=None, is_online=False):
    policy = MakeDeterministic(self.policy) if deterministic else self.policy
    paths = []
    n_steps_total = 0
    max_samp = self.max_samples
    if num_samples is not None:
        max_samp = num_samples
    # import pdb; pdb.set_trace()
    while n_steps_total + self.max_path_length <= max_samp:
        if num_rollouts is not None and num_rollouts <= len(paths):
            break
        path = rollout(self.env, policy,
                       max_path_length=self.max_path_length,
                       is_online=is_online)
        paths.append(path)
        n_steps_total += len(path['observations'])
    return paths
def obtain_samples(self,
                   deterministic=False,
                   max_samples=np.inf,
                   max_trajs=np.inf,
                   accum_context=True,
                   resample=1,
                   testing=False):
    assert max_samples < np.inf or max_trajs < np.inf, \
        "either max_samples or max_trajs must be finite"
    policy = MakeDeterministic(self.policy) if deterministic else self.policy
    paths = []
    n_steps_total = 0
    n_trajs = 0
    if self.itr <= self.num_train_itr:
        if self.tandem_train:
            self._train(policy, accum_context)
            self.itr += 1
        else:
            for _ in range(self.num_train_itr):
                self._train(policy, accum_context)
                self.itr += 1
    while n_steps_total < max_samples and n_trajs < max_trajs:
        if testing:
            path = rollout(self.env, policy,
                           max_path_length=self.max_path_length,
                           accum_context=accum_context)
        else:
            path = rollout(self.model, policy,
                           max_path_length=self.max_path_length,
                           accum_context=accum_context)
        # save the latent context that generated this trajectory
        path['context'] = policy.z.detach().cpu().numpy()
        paths.append(path)
        n_steps_total += len(path['observations'])
        n_trajs += 1
        # don't we also want the option to resample z every transition?
        if n_trajs % resample == 0:
            policy.sample_z()
    return paths, n_steps_total
def plot_separated_by_task(file):
    file = "./logs/sac-pointmass-multitask-5/sac-pointmass-multitask-5_2019_04_20_18_57_19_0000--s-0/params.pkl"
    data = joblib.load(file)
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    # plt.figure(figsize=(8, 8))
    num_goals = len(env.goals)
    has_circle = np.zeros(num_goals).astype(bool)
    fig, ax = plt.subplots(nrows=5, ncols=num_goals // 5)
    fig.set_size_inches(8, 8)
    print("Number of goals:", num_goals)
    for i in range(200):
        path = rollout(
            env,
            policy,
            max_path_length=100,
            animated=False,
        )
        # print(path)
        obs = path["observations"]
        acts = path["actions"]
        goal_idx = np.argmax(obs[0, 2:])
        plot_row, plot_col = goal_idx // 5, goal_idx % 5
        goal_plot = ax[plot_row, plot_col]
        # Turn off tick labels
        goal_plot.set_yticklabels([])
        goal_plot.set_xticklabels([])
        start_x = obs[0, 0]
        start_y = obs[0, 1]
        goal_plot.scatter(start_x, start_y, color="green")
        goal_plot.scatter(obs[1:, 0], obs[1:, 1], color="b")
        goal_plot.quiver(obs[:, 0], obs[:, 1], acts[:, 0], acts[:, 1],
                         angles='xy', scale_units='xy', scale=1,
                         width=.005, headwidth=3, alpha=.9)
        # plt.annotate("start=({0}, {1})".format(start_x.round(4), start_y.round(4)),
        #              (start_x, start_y), xytext=(start_x - .5, start_y + .2))
        final_x, final_y = obs[len(obs) - 1, 0], obs[len(obs) - 1, 1]
        # plt.annotate("end=({0}, {1})".format(final_x.round(4), final_y.round(4)),
        #              (final_x, final_y), xytext=(final_x - .5, final_y - .2))
        goal = env.goals[goal_idx]
        goal_x, goal_y = goal[0], goal[1]
        # plt.annotate("goal=({0}, {1})".format(goal_x.round(4), goal_y.round(4)),
        #              (goal_x, goal_y), xytext=(goal_x - .5, goal_y + .1))
        goal_plot.scatter(goal[0], goal[1], color="r")  # Goal
        goal_plot.set_xlim(-1.5, 1.5)
        goal_plot.set_ylim(-1.5, 1.5)
        if not has_circle[goal_idx]:
            circle = plt.Circle((0, 0), 1, color='black', alpha=.5, fill=False)
            goal_plot.add_artist(circle)
            has_circle[goal_idx] = True
def get_eval_policy(self, task_identifier, mode='meta_test'):
    if task_identifier not in self.context_buffer.task_replay_buffers:
        # generate some rollouts with prior policy
        eval_context_buffer = MetaEnvReplayBuffer(
            self.context_buffer_size_per_task,
            self.training_env,
            policy_uses_pixels=self.policy_uses_pixels,
        )
        n_steps_total = 0
        steps_needed = self.num_context_trajs_for_exploration * self.max_path_length
        task_params = self.training_env.task_id_to_task_params(task_identifier)
        obs_task_params = self.training_env.task_id_to_obs_task_params(task_identifier)
        while n_steps_total < steps_needed:
            first_obs = self.training_env.reset(
                task_params=task_params, obs_task_params=obs_task_params)
            task_id = self.training_env.task_identifier
            z = self.prior_dist.sample()
            z = z.cpu().data.numpy()[0]
            post_cond_policy = PostCondMLPPolicyWrapper(self.main_policy, z)
            new_path = rollout(
                self.training_env,
                post_cond_policy,
                max_path_length=min(
                    self.max_path_length + 1,
                    steps_needed - n_steps_total + 1),
                do_not_reset=True,
                first_obs=first_obs)
            n_steps_total += len(new_path['observations'])
            eval_context_buffer.add_path(new_path, task_id)
        list_of_trajs = eval_context_buffer.sample_trajs_from_task(
            task_identifier,
            self.num_context_trajs_for_exploration,
            samples_per_traj=self.samples_per_traj)
        mask = None
    else:
        list_of_trajs = self.context_buffer.sample_trajs_from_task(
            task_identifier,
            self.num_context_trajs_for_exploration,
        )
        mask = None
    enc_to_use = self.encoder
    # remember the encoder's training flag so it can be restored afterwards
    enc_was_training = enc_to_use.training
    enc_to_use.eval()
    post_dist = enc_to_use([list_of_trajs], mask)
    enc_to_use.train(enc_was_training)
    z = post_dist.sample()
    z = z.cpu().data.numpy()[0]
    return PostCondMLPPolicyWrapper(self.main_policy, z)
def obtain_samples(self):
    paths = []
    n_steps_total = 0
    while n_steps_total + self.max_path_length <= self.max_samples:
        path = rollout(self.env, self.policy,
                       max_path_length=self.max_path_length)
        paths.append(path)
        n_steps_total += len(path['observations'])
    return paths
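# All of the samplers in this file assume a `rollout(env, policy, ...)` helper that
# plays one episode and returns a dict of aligned arrays, which is why
# len(path['observations']) counts the transitions collected. Below is a minimal
# sketch under those assumptions; the real helper also accepts extra keyword
# arguments (skill, accum_context, animated/render, is_online, ...) and records
# additional keys (next_observations, terminals, images, env_infos), all omitted here.
import numpy as np

def rollout_sketch(env, policy, max_path_length=100):
    observations, actions, rewards = [], [], []
    obs = env.reset()
    done = False
    path_length = 0
    while not done and path_length < max_path_length:
        # rlkit-style policies return (action, agent_info)
        action, _ = policy.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        observations.append(obs)
        actions.append(action)
        rewards.append(reward)
        obs = next_obs
        path_length += 1
    return dict(
        observations=np.array(observations),
        actions=np.array(actions),
        rewards=np.array(rewards).reshape(-1, 1),
    )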
def obtain_samples(self):
    paths = []
    n_steps_total = 0
    while n_steps_total + self.max_path_length <= self.max_samples:
        self.start_new_rollout()
        path = rollout(self.env, self.policy,
                       max_path_length=self.max_path_length)
        self.handle_rollout_ending()
        paths.append(path)
        n_steps_total += len(path['observations'])
    return paths
def dump_video(
        env,
        policy,
        filename,
        ROWS=3,
        COLUMNS=6,
        do_timer=True,
        horizon=100,
        image_env=None,
        dirname=None,
        subdirname="rollouts",
):
    policy.train(False)  # is this right/necessary?
    paths = []
    num_channels = env.vae.input_channels
    frames = []
    N = ROWS * COLUMNS
    for i in range(N):
        rollout_dir = osp.join(dirname, subdirname, str(i))
        os.makedirs(rollout_dir, exist_ok=True)
        start = time.time()
        paths.append(rollout(
            env,
            policy,
            frames,
            max_path_length=horizon,
            animated=False,
            image_env=image_env,
        ))
        rollout_frames = frames[-101:]
        goal_img = np.flip(rollout_frames[0][:84, :84, :], 0)
        scipy.misc.imsave(rollout_dir + "/goal.png", goal_img)
        goal_img = np.flip(rollout_frames[1][:84, :84, :], 0)
        scipy.misc.imsave(rollout_dir + "/z_goal.png", goal_img)
        for j in range(0, 101, 1):
            img = np.flip(rollout_frames[j][84:, :84, :], 0)
            scipy.misc.imsave(rollout_dir + "/" + str(j) + ".png", img)
        if do_timer:
            print(i, time.time() - start)
    # H and W are assumed to be module-level frame height/width constants.
    frames = np.array(frames, dtype=np.uint8).reshape(
        (N, horizon + 1, H, W, num_channels))
    f1 = []
    for k1 in range(COLUMNS):
        f2 = []
        for k2 in range(ROWS):
            k = k1 * ROWS + k2
            f2.append(frames[k:k + 1, :, :, :, :].reshape(
                (horizon + 1, H, W, num_channels)))
        f1.append(np.concatenate(f2, axis=1))
    outputdata = np.concatenate(f1, axis=2)
    skvideo.io.vwrite(filename, outputdata)
    print("Saved video to ", filename)
    return paths
def obtain_samples(self, deterministic=False, num_samples=None,
                   is_online=False):
    policy = MakeDeterministic(self.policy) if deterministic else self.policy
    paths = []
    n_steps_total = 0
    max_samp = self.max_samples
    if num_samples is not None:
        max_samp = num_samples
    while n_steps_total + self.max_path_length < max_samp:
        path = rollout(
            self.env,
            policy,
            max_path_length=self.max_path_length,
            is_online=is_online)
        paths.append(path)
        n_steps_total += len(path['observations'])
    return paths
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
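# A typical entry point for the visualization scripts above; the argument names
# mirror the attributes they read (args.file, args.H, args.gpu). This is a sketch,
# assuming the script is run directly on a saved snapshot; it is not part of the
# original source, and the default for H is an arbitrary illustrative value.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the snapshot file')
    parser.add_argument('--H', type=int, default=300, help='max path length')
    parser.add_argument('--gpu', action='store_true')
    args = parser.parse_args()
    simulate_policy(args)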
def simulate_policy(args):
    # data = joblib.load(args.file)
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(gym.make(str(args.env)))
    # env = env.wrapped_env.unwrapped
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    import cv2
    video = None
    # index = 0
    for skill in range(policy.stochastic_policy.skill_dim):
        for trial in range(3):
            print("skill-{} rollout-{}".format(skill, trial))
            path = rollout(
                env,
                policy,
                skill,
                max_path_length=args.H,
                render=True,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()
            for i, img in enumerate(path['images']):
                # print(i)
                # print(img.shape)
                if video is None:
                    # cv2.VideoWriter expects frameSize as (width, height),
                    # i.e. the reverse of img.shape[:2].
                    video = cv2.VideoWriter(
                        '{}.avi'.format(str(args.env)),
                        cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                        30, (img.shape[1], img.shape[0]))
                video.write(img[:, :, ::-1].astype(np.uint8))
                # cv2.imwrite("frames/diayn_bipedal_walker_hardcore.avi/%06d.png" % index, img[:, :, ::-1])
                # index += 1
    video.release()
    print("wrote video")
def simulate_policy(args):
    data = joblib.load(args.file)  # joblib uses pickle internally
    policy = data['policy']
    env = data['env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=False,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def create_policy(variant):
    bottom_snapshot = joblib.load(variant['bottom_path'])
    column_snapshot = joblib.load(variant['column_path'])
    policy = variant['combiner_class'](
        policy1=bottom_snapshot['naf_policy'],
        policy2=column_snapshot['naf_policy'],
    )
    env = bottom_snapshot['env']
    logger.save_itr_params(0, dict(
        policy=policy,
        env=env,
    ))
    path = rollout(
        env,
        policy,
        max_path_length=variant['max_path_length'],
        animated=variant['render'],
    )
    env.log_diagnostics([path])
    logger.dump_tabular()
def simulate_policy(args):
    data = joblib.load(args.file)
    import ipdb; ipdb.set_trace()
    policy = data['exploration_policy']  # ? TODO, eval ?
    env = data['env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def _run_policy(env, policy, num_rollouts):
    """
    Takes in a trained policy, runs it for a specified number of rollouts,
    and returns the results of the experiment.

    :param env: The environment to roll the policy out in.
    :param policy: The trained policy to evaluate.
    :param num_rollouts: The number of rollouts to experience.
    :return: A dict summarizing the `num_rollouts` recorded paths for
        further analysis.
    """
    start_states, final_states, goal_states, actions, paths = [], [], [], [], []
    for i in range(num_rollouts):
        path = rollout(
            env,
            policy,
            max_path_length=100,
            animated=False,
        )
        obs = path["observations"]
        acts = path["actions"]
        goal_idx = np.argmax(obs[0, 2:])
        start_x, start_y = obs[0, 0], obs[0, 1]
        acts_x, acts_y = acts[:, 0], acts[:, 1]
        final_x, final_y = obs[len(obs) - 1, 0], obs[len(obs) - 1, 1]
        goal = env.goals[goal_idx]
        goal_x, goal_y = goal[0], goal[1]
        start_states.append(np.array([start_x, start_y]))
        final_states.append(np.array([final_x, final_y]))
        goal_states.append(np.array([goal_x, goal_y]))
        actions.append(np.array([acts_x, acts_y]))
        paths.append(path)
    return dict(start_states=np.array(start_states),
                final_states=np.array(final_states),
                goal_states=np.array(goal_states),
                actions=np.array(actions),
                paths=paths,
                env=env)
def pretrain(self):
    print('Generating initial contexts')
    # fill the contexts
    for task_params, obs_task_params in self.train_task_params_sampler:
        print('task')
        n_steps_total = 0
        # print(n_steps_total)
        while n_steps_total < self.context_buffer_size_per_task:
            # print('------')
            # print(n_steps_total)
            # print(self.context_buffer_size_per_task)
            # print(self.max_path_length)
            first_obs = self.training_env.reset(
                task_params=task_params, obs_task_params=obs_task_params)
            task_id = self.training_env.task_identifier
            z = self.prior_dist.sample()
            z = z.cpu().data.numpy()[0]
            post_cond_policy = PostCondMLPPolicyWrapper(self.main_policy, z)
            new_path = rollout(
                self.training_env,
                post_cond_policy,
                max_path_length=min(
                    self.max_path_length + 1,
                    self.context_buffer_size_per_task - n_steps_total + 1),
                do_not_reset=True,
                first_obs=first_obs)
            # print(len(new_path['observations']))
            n_steps_total += len(new_path['observations'])
            if self.add_context_rollouts_to_replay_buffer:
                self.replay_buffer.add_path(new_path, task_id)
            self.context_buffer.add_path(new_path, task_id)
    print('Generating initial replay buffer rollouts')
    super().pretrain()
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['policy']
    # env = data['env']
    from rlkit.envs.mujoco_manip_env import MujocoManipEnv
    env = MujocoManipEnv("SawyerLiftEnv", render=True)
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def simulate_policy(args):
    manager_data = torch.load(args.manager_file)
    worker_data = torch.load(args.worker_file)
    policy = manager_data['evaluation/policy']
    worker = worker_data['evaluation/policy']
    env = NormalizedBoxEnv(gym.make(str(args.env)))
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()

    import cv2
    video = cv2.VideoWriter('ppo_dirichlet_diayn_bipedal_walker_hardcore.avi',
                            cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                            30, (1200, 800))
    index = 0
    path = rollout(
        env,
        policy,
        worker,
        continuous=True,
        max_path_length=args.H,
        render=True,
    )
    if hasattr(env, "log_diagnostics"):
        env.log_diagnostics([path])
    logger.dump_tabular()
    for i, img in enumerate(path['images']):
        print(i)
        video.write(img[:, :, ::-1].astype(np.uint8))
        # cv2.imwrite("frames/ppo_dirichlet_diayn_policy_bipedal_walker_hardcore/%06d.png" % index, img[:, :, ::-1])
        index += 1
    video.release()
    print("wrote video")
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['mpc_controller']
    env = data['env']
    print("Policy loaded")
    if args.pause:
        import ipdb
        ipdb.set_trace()
    policy.cost_fn = env.cost_fn
    policy.env = env
    if args.T:
        policy.mpc_horizon = args.T
    paths = []
    while True:
        paths.append(rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
        ))
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        logger.dump_tabular()
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']
    print("Policy loaded")
    farmer = Farmer([('0.0.0.0', 1)])
    env_to_sim = farmer.force_acq_env()
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env_to_sim,
            policy,
            max_path_length=args.H,
            animated=False,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def simulate_policy(args):
    data = joblib.load(args.file)
    cont = False
    if 'policies' in data:
        policy = data['policies'][0]
    else:
        policy = data['policy']
    env = NormalizedBoxEnv(create_swingup())  # data['env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
        data['qf1'].cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    diayn = 'df' in data
    rnd = 'rf' in data
    if diayn:
        skills = len(data['eval_policy'].skill_vec)
        disc = data['df']
        policy = OptionPolicy(policy, skills, cont)
        if args.gpu:
            disc.cuda()
        if isinstance(policy, PyTorchModule):
            disc.train(False)
    if rnd:
        data['rf'].cuda()
        data['pf'].cuda()
        data['qf1'].cuda()

    import cv2
    video = cv2.VideoWriter('video.avi', cv2.VideoWriter_fourcc(*"H264"),
                            30, (640, 480))
    index = 0
    truth, pred = [], []
    if cont:
        eps = 1
    elif diayn:
        eps = skills * 2
    else:
        eps = 5
    Rs = []
    for ep in range(eps):
        if diayn and not cont:
            z_index = ep // 2
            policy.set_z(z_index)
        path = rollout(
            env,
            policy,
            max_path_length=args.H * skills if cont else args.H,
            animated=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        total_r = 0
        if diayn:
            predictions = F.log_softmax(
                disc(torch.FloatTensor(path['observations']).cuda()),
                1).cpu().detach().numpy()
            probs = predictions.max(1)
            labels = predictions.argmax(1)
            if cont:
                for k in range(skills):
                    truth.extend([k] * 100)
            else:
                truth.extend([z_index] * len(labels))
            pred.extend(labels.tolist())
        if rnd:
            random_feats = data['rf'](torch.FloatTensor(
                path['observations']).cuda())
            pred_feats = data['pf'](torch.FloatTensor(
                path['observations']).cuda())
            i_rewards = ((random_feats - pred_feats) ** 2.0).sum(1).cpu().data.numpy()
        q_pred = data['qf1'](torch.FloatTensor(path['observations']).cuda(),
                             torch.FloatTensor(
                                 path['actions']).cuda()).cpu().data.numpy()
        for i, (img, r, s) in enumerate(
                zip(path['images'], path['rewards'], path['observations'])):
            # video.write(img[:, :, ::-1].astype(np.uint8))
            total_r += r[0]
            img = img.copy()
            img = np.rot90(img, 3).copy()
            col = (255, 0, 255)
            cv2.putText(img, "step: %d" % (i + 1), (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
            if diayn:
                if cont:
                    cv2.putText(img, "z: %s" % str(truth[i]), (20, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                                (255, 255, 255), 2, cv2.LINE_AA)
                else:
                    cv2.putText(img, "z: %s" % str(z_index), (20, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                                (255, 255, 255), 2, cv2.LINE_AA)
                cv2.putText(img, "disc_pred: %s (%.3f)" % (labels[i], probs[i]),
                            (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                            (255, 255, 255), 2, cv2.LINE_AA)
                cv2.putText(img, "reward: %.3f" % r[0], (20, 160),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                            (255, 255, 255), 2, cv2.LINE_AA)
                cv2.putText(img, "total reward: %.1f" % total_r, (20, 200),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                            (255, 255, 255), 2, cv2.LINE_AA)
                cv2.putText(img, "action: %s" % path['actions'][i], (20, 240),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                            (255, 255, 255), 2, cv2.LINE_AA)
            else:
                cv2.putText(img, "reward: %.1f" % r[0], (20, 80),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
                cv2.putText(img, "total reward: %.1f" % total_r, (20, 120),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
                y = 120
                if rnd:
                    cv2.putText(img, "i reward (unscaled): %.3f" % i_rewards[i],
                                (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 1.0,
                                col, 2, cv2.LINE_AA)
                    # cv2.rectangle(img, (20, 180), (20 + int(q_pred[i, 0]), 200), (255, 0, 255), -1)
                    cv2.rectangle(img, (20, 200),
                                  (20 + int(i_rewards[i] * 10), 220),
                                  (255, 255, 0), -1)
                    y = 220
                try:
                    y += 40
                    cv2.putText(img, "Q: %.3f" % q_pred[i], (20, y),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
                except:
                    y += 40
                    cv2.putText(img, "Q:" + str([q for q in q_pred[i]]), (20, y),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
                y += 40
                cv2.putText(img, str(["%.3f" % x for x in path['observations'][i]]),
                            (20, y), cv2.FONT_HERSHEY_SIMPLEX, 1.0, col, 2, cv2.LINE_AA)
            try:
                cv2.imwrite("frames/%06d.png" % index, img[:, :, ::-1])
            except:
                cv2.imwrite("frames/%06d.png" % index, img[:, :])
            index += 1
        if diayn:
            print(z_index, ":", total_r)
        Rs.append(total_r)
    print("best", np.argmax(Rs))
    print("worst", np.argmin(Rs))
    video.release()
    print("wrote video")

    if diayn:
        import sklearn
        from sklearn.metrics import confusion_matrix
        import matplotlib as mpl
        import itertools
        mpl.use('Agg')
        import matplotlib.pyplot as plt
        normalize = False
        classes = range(skills)
        cm = confusion_matrix(truth, pred)
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.colorbar()
        tick_marks = np.arange(skills)
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        """
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        """
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig("confusion.png")
args = parser.parse_args()

vertical_pos = 'middle'
horizontal_pos = 'bottom'
ddpg1_snapshot_path, ddpg2_snapshot_path, x_goal, y_goal = (
    get_snapshots_and_goal(
        vertical_pos=vertical_pos,
        horizontal_pos=horizontal_pos,
    ))
env_params = dict(goal=(x_goal, y_goal), )
env = PusherEnv3DOF(**env_params)
env = normalize(env)
ddpg1_snapshot_dict = joblib.load(ddpg1_snapshot_path)
ddpg2_snapshot_dict = joblib.load(ddpg2_snapshot_path)
policy = AveragerPolicy(
    ddpg1_snapshot_dict['policy'],
    ddpg2_snapshot_dict['policy'],
)
while True:
    path = rollout(
        env,
        policy,
        max_path_length=args.H,
        animated=True,
    )
    env.log_diagnostics([path])
    policy.log_diagnostics([path])
    logger.dump_tabular()
def sim_policy(variant,
               path_to_exp,
               num_trajs=1,
               deterministic=False,
               save_video=False,
               animated=False):
    '''
    simulate a trained policy adapting to a new task
    optionally save videos of the trajectories - requires ffmpeg

    :variant: experiment configuration dict
    :path_to_exp: path to exp folder
    :num_trajs: number of trajectories to simulate per task (default 1)
    :deterministic: if the policy is deterministic (default stochastic)
    :save_video: whether to generate and save a video (default False)
    :animated: whether to render the rollouts on screen (default False)
    '''
    # create multi-task environment and sample tasks
    env = CameraWrapper(
        NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params'])),
        variant['util_params']['gpu_id'])
    if animated:
        env.render()
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    eval_tasks = list(tasks[-variant['n_eval_tasks']:])
    print('testing on {} test tasks, {} trajectories each'.format(
        len(eval_tasks), num_trajs))

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    reward_dim = 1
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder
    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=obs_dim + action_dim + reward_dim,
        output_size=context_encoder,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    # deterministic eval
    if deterministic:
        agent = MakeDeterministic(agent)

    # load trained weights (otherwise simulate random policy)
    context_encoder.load_state_dict(
        torch.load(os.path.join(path_to_exp, 'context_encoder.pth'),
                   map_location=torch.device('cpu')))
    policy.load_state_dict(
        torch.load(os.path.join(path_to_exp, 'policy.pth'),
                   map_location=torch.device('cpu')))

    # loop through tasks collecting rollouts
    all_rets = []
    video_frames = []
    for idx in eval_tasks:
        env.reset_task(idx)
        agent.clear_z()
        paths = []
        for n in range(num_trajs):
            path = rollout(
                env,
                agent,
                max_path_length=variant['algo_params']['num_steps_per_eval'],
                accum_context=True,
                animated=animated,
                save_frames=save_video)
            paths.append(path)
            if save_video:
                video_frames += [t['frame'] for t in path['env_infos']]
            if n >= variant['algo_params']['num_exp_traj_eval']:
                agent.infer_posterior(agent.context)
        all_rets.append([sum(p['rewards']) for p in paths])

    if save_video:
        # save frames to file temporarily
        temp_dir = os.path.join(path_to_exp, 'temp')
        os.makedirs(temp_dir, exist_ok=True)
        for i, frm in enumerate(video_frames):
            frm.save(os.path.join(temp_dir, '%06d.jpg' % i))
        video_filename = os.path.join(path_to_exp, 'video.mp4'.format(idx))
        # run ffmpeg to make the video
        os.system('ffmpeg -i {}/%06d.jpg -vcodec mpeg4 {}'.format(
            temp_dir, video_filename))
        # delete the frames
        shutil.rmtree(temp_dir)

    # compute average returns across tasks
    n = min([len(a) for a in all_rets])
    rets = [a[:n] for a in all_rets]
    rets = np.mean(np.stack(rets), axis=0)
    for i, ret in enumerate(rets):
        print('trajectory {}, avg return: {} \n'.format(i, ret))
policy = data['evaluation/policy']
env = data['evaluation/env']
plt.figure(figsize=(8, 8))
num_goals = len(env.goals)
final_states = []
goals = []
print("Number of goals:", num_goals)
num_plotted = 0
# for i in range(10):
while num_plotted < 100:
    path = rollout(
        env,
        policy,
        max_path_length=100,
        animated=False,
    )
    # print(path)
    obs = path["observations"]
    acts = path["actions"]
    goal_idx = np.argmax(obs[0, 2:])
    num_plotted += 1
    plot_row, plot_col = goal_idx // 5, goal_idx % 5
    start_x = obs[0, 0]
    start_y = obs[0, 1]