def save_videos(self, step, policy, save_name='sample_video', num_rollouts=2,
                teacher_dict={}, save_video=False, log_prefix=None, stochastic=True):
    """Roll out the policy at the current curriculum level, optionally saving
    videos, and log accuracy/reward/path-length/success under log_prefix."""
    policy.eval()
    self.env.set_level_distribution(self.curriculum_step)
    # Only upload videos to wandb for non-debug runs.
    save_wandb = save_video and not self.is_debug
    paths, accuracy = rollout(self.env, policy, max_path_length=200,
                              reset_every=self.rollouts_per_meta_task,
                              stochastic=stochastic, batch_size=1,
                              record_teacher=True, teacher_dict=teacher_dict,
                              video_directory=self.exp_name,
                              video_name=save_name + str(self.curriculum_step),
                              num_rollouts=num_rollouts, save_wandb=save_wandb,
                              save_locally=False,
                              obs_preprocessor=self.obs_preprocessor)
    if log_prefix is not None:
        logger.logkv(log_prefix + "Acc", accuracy)
        logger.logkv(log_prefix + "Reward",
                     np.mean([sum(path['rewards']) for path in paths]))
        logger.logkv(log_prefix + "PathLength",
                     np.mean([path['env_infos'][-1]['episode_length'] for path in paths]))
        logger.logkv(log_prefix + "Success",
                     np.mean([path['env_infos'][-1]['success'] for path in paths]))
    return paths, accuracy
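# Usage sketch (assumption, not from the source): a training loop might call
# save_videos() periodically to record evaluation rollouts and log metrics.
# `self.log_every` and `self.teacher_dict` are hypothetical attributes here.
#
#     if step % self.log_every == 0:
#         self.save_videos(step, policy, save_name='eval_video',
#                          teacher_dict=self.teacher_dict, save_video=True,
#                          log_prefix='Eval', stochastic=False)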
with tf.Session() as sess:
    pkl_path = args.param
    print("Testing policy %s" % pkl_path)
    data = joblib.load(pkl_path)
    policy = data['policy']
    env = normalize(ArmReacherEnv(side='right'))
    real_rewards = np.array([])
    act_rewards = np.array([])
    pos_rewards = np.array([])
    for _ in range(args.num_rollouts):
        # rollout() returns a list of paths; index 0 is the single rollout.
        path = rollout(env, policy, max_path_length=args.max_path_length,
                       animated=False, speedup=args.speedup,
                       video_filename=args.video_filename, save_video=False,
                       ignore_done=args.ignore_done, stochastic=args.stochastic)
        real_rewards = np.append(real_rewards, np.sum(path[0]['rewards']))
        print("Real Reward Sum", np.sum(path[0]['rewards']))
    print("Real Reward Avg")
    print(np.mean(real_rewards))
experiment_paths = load_exps_data(args.path, gap=args.gap_pkl, max=args.max_pkl)
for exp_path in experiment_paths:
    # Fall back to the max_path_length stored in the experiment's json config.
    max_path_length = (exp_path['json']['max_path_length']
                       if args.max_path_length is None else args.max_path_length)
    if valid_experiment(exp_path['json']):
        for pkl_path in exp_path['pkl']:
            with tf.Session() as sess:
                print("\n Testing policy %s \n" % pkl_path)
                data = joblib.load(pkl_path)
                policy = data['policy']
                # MAML-style policies must be reset to their pre-update weights.
                if hasattr(policy, 'switch_to_pre_update'):
                    policy.switch_to_pre_update()
                env = data['env']
                video_filename = pkl_path.split('.')[0] + '.mp4'
                paths = rollout(env, policy, max_path_length=max_path_length,
                                animated=False, speedup=args.speedup,
                                video_filename=video_filename, save_video=True,
                                ignore_done=args.ignore_done,
                                stochastic=args.stochastic,
                                num_rollouts=args.num_rollouts)
                print('Average Returns: ', np.mean([sum(path['rewards']) for path in paths]))
            # Reset the graph so the next checkpoint loads into a fresh session.
            tf.reset_default_graph()
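# Assumed structure of load_exps_data()'s return value, inferred from the
# usage above (not confirmed by the source): one dict per experiment, e.g.
#
#     [{'json': {...the run's config, including 'max_path_length'...},
#       'pkl': ['itr_0.pkl', 'itr_10.pkl', ...]},
#      ...]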
env_class = getattr(iclr19_levels, args.class_name)
# Keyword arguments for the selected iclr19_levels environment class.
env_args = {
    'start_loc': args.start_loc,
    'include_holdout_obj': args.holdout_obj,
}
if args.grid_size is not None:
    env_args['room_size'] = args.grid_size
if args.num_dists is not None:
    env_args['num_dists'] = args.num_dists
e_new = env_class(**env_args)
e_new.use_teacher = args.use_teacher
if args.use_teacher:
    teacher = PostActionAdvice(Bot, e_new)
    e_new.teacher = teacher
    e_new.teacher.set_feedback_type(args.feedback_type)
env = rl2env(normalize(e_new))
video_filename = os.path.join(args.path, 'saved_video.mp4')
paths, accuracy = rollout(env, policy, max_path_length=max_path_length,
                          animated=args.animated, speedup=args.speedup,
                          video_filename=video_filename, save_video=True,
                          ignore_done=args.ignore_done, batch_size=1,
                          stochastic=args.stochastic,
                          num_rollouts=args.num_rollouts,
                          reset_every=args.reset_every, record_teacher=True,
                          reward_predictor=reward_predictor,
                          dense_rewards=args.dense_rewards)
print('Average Returns: ', np.mean([sum(path['rewards']) for path in paths]))
print('Average Path Length: ', np.mean([path['env_infos'][-1]['episode_length'] for path in paths]))
print('Average Success Rate: ', np.mean([path['env_infos'][-1]['success'] for path in paths]))
tf.reset_default_graph()
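# Hedged sketch (assumption, not from the source): an argparse setup covering
# the args.* flags this snippet reads; flag types and defaults are illustrative.
#
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--class_name', type=str, required=True)
#     parser.add_argument('--start_loc', type=str, default='all')
#     parser.add_argument('--holdout_obj', action='store_true')
#     parser.add_argument('--grid_size', type=int, default=None)
#     parser.add_argument('--num_dists', type=int, default=None)
#     parser.add_argument('--use_teacher', action='store_true')
#     parser.add_argument('--feedback_type', type=str, default=None)
#     parser.add_argument('--path', type=str, required=True)
#     args = parser.parse_args()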
print("Testing policy %s" % pkl_path) data = joblib.load(pkl_path) policy = data['policy'] env = normalize(ArmReacherEnv(side='right')) goal = data['env'].goal real_rewards = np.array([]) act_rewards = np.array([]) pos_rewards = np.array([]) for i in range(args.num_rollouts): path = rollout(env, policy, max_path_length=args.max_path_length, animated=False, speedup=args.speedup, video_filename=args.video_filename, save_video=False, ignore_done=args.ignore_done, stochastic=args.stochastic) mujoco_env_mimic_act = normalize(BlueEnv(actions=env.actions)) pickle.dump(env.actions, open("actions_ppo_0.pkl", "wb")) mujoco_env_mimic_act.goal = env.goal act_filename = "local_act_maml_" + str(i) + ".mp4" path_act = rollout(mujoco_env_mimic_act, policy, max_path_length=args.max_path_length, animated=True,