def dump_video(
        env,
        policy,
        filename,
        ROWS=3,
        COLUMNS=6,
        do_timer=True,
        horizon=100,
        image_env=None,
        dirname=None,
        subdirname="rollouts",
):
    # Put the policy in eval mode so stochastic layers (dropout, batch norm)
    # behave deterministically while recording.
    policy.train(False)
    paths = []
    num_channels = env.vae.input_channels
    frames = []
    N = ROWS * COLUMNS
    for i in range(N):
        rollout_dir = osp.join(dirname, subdirname, str(i))
        os.makedirs(rollout_dir, exist_ok=True)
        start = time.time()
        paths.append(rollout(
            env,
            policy,
            frames,
            max_path_length=horizon,
            animated=False,
            image_env=image_env,
        ))
        # Each rollout appends horizon + 1 frames (including the initial frame).
        rollout_frames = frames[-(horizon + 1):]
        goal_img = np.flip(rollout_frames[0][:84, :84, :], 0)
        scipy.misc.imsave(rollout_dir + "/goal.png", goal_img)
        goal_img = np.flip(rollout_frames[1][:84, :84, :], 0)
        scipy.misc.imsave(rollout_dir + "/z_goal.png", goal_img)
        for j in range(horizon + 1):
            img = np.flip(rollout_frames[j][84:, :84, :], 0)
            scipy.misc.imsave(rollout_dir + "/" + str(j) + ".png", img)
        if do_timer:
            print(i, time.time() - start)
    # H and W are assumed to be module-level constants giving the height and
    # width of a single rendered frame.
    frames = np.array(frames, dtype=np.uint8).reshape(
        (N, horizon + 1, H, W, num_channels))
    # Tile the N rollouts into a ROWS x COLUMNS grid for each timestep.
    f1 = []
    for k1 in range(COLUMNS):
        f2 = []
        for k2 in range(ROWS):
            k = k1 * ROWS + k2
            f2.append(frames[k:k + 1, :, :, :, :].reshape(
                (horizon + 1, H, W, num_channels)))
        f1.append(np.concatenate(f2, axis=1))
    outputdata = np.concatenate(f1, axis=2)
    skvideo.io.vwrite(filename, outputdata)
    print("Saved video to ", filename)
    return paths
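# Hedged usage sketch for dump_video. The snapshot keys ('env', 'policy') and
# the paths below are assumptions for illustration, not the project's actual
# entry point.
def _example_dump_video(snapshot_path='/tmp/params.pkl',
                        out_dir='/tmp/video_dump'):
    import joblib
    import os.path as osp
    snapshot = joblib.load(snapshot_path)  # assumed to contain 'env' and 'policy'
    dump_video(
        env=snapshot['env'],
        policy=snapshot['policy'],
        filename=osp.join(out_dir, 'rollouts.mp4'),
        dirname=out_dir,
        ROWS=3,
        COLUMNS=6,
        horizon=100,
    )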
def obtain_samples(self):
    paths = []
    n_steps_total = 0
    while n_steps_total + self.max_path_length <= self.max_samples:
        path = rollout(
            self.env,
            self.policy,
            max_path_length=self.max_path_length,
            animated=self.render,
        )
        paths.append(path)
        n_steps_total += len(path['observations'])
    return paths
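# The snippets in this file all rely on a rollout(env, policy, ...) helper that
# returns a dict of per-step arrays. This is a minimal sketch of that contract,
# with the keys inferred from the call sites here ('observations', 'actions',
# 'next_observations', 'final_observation') and an rllab/railrl-style
# policy.get_action(obs) -> (action, agent_info) interface assumed; the real
# implementation also records rewards, terminals, rendering, and agent infos.
def _rollout_sketch(env, policy, max_path_length):
    import numpy as np
    observations, actions, next_observations = [], [], []
    o = env.reset()
    for _ in range(max_path_length):
        a, _ = policy.get_action(o)
        next_o, r, done, _ = env.step(a)
        observations.append(o)
        actions.append(a)
        next_observations.append(next_o)
        o = next_o
        if done:
            break
    return dict(
        observations=np.array(observations),
        actions=np.array(actions),
        next_observations=np.array(next_observations),
        final_observation=o,
    )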
def simulate_policy(args):
    dir = args.path
    data = joblib.load("{}/params.pkl".format(dir))
    env = data['env']
    model_params = data['model_params']
    mpc_params = data['mpc_params']
    # dyn_model = NNDynamicsModel(env=env, **model_params)
    # mpc_controller = MPCcontroller(env=env,
    #                                dyn_model=dyn_model,
    #                                **mpc_params)
    tf_path_meta = "{}/tf_out-0.meta".format(dir)
    tf_path = "{}/tf_out-0".format(dir)
    with tf.Session() as sess:
        new_saver = tf.train.import_meta_graph(tf_path_meta)
        new_saver.restore(sess, tf_path)

    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")
    # NOTE: `policy` is never assigned in this snippet; it is expected to be
    # restored from the snapshot or rebuilt via the commented-out MPC
    # controller above before the rollout loop runs.
    if args.gpu:
        set_gpu_mode(True)
        policy.to(ptu.device)
    if args.pause:
        import ipdb
        ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        try:
            path = rollout(
                env,
                policy,
                max_path_length=args.H,
                animated=True,
            )
            env.log_diagnostics([path])
            policy.log_diagnostics([path])
            logger.dump_tabular()
        # Hack for now: some environments' render() does not accept the
        # 'close' keyword argument that rollout passes when closing.
        except TypeError as e:
            if (str(e) != "render() got an unexpected keyword "
                          "argument 'close'"):
                raise e
def create_policy(variant):
    bottom_snapshot = joblib.load(variant['bottom_path'])
    column_snapshot = joblib.load(variant['column_path'])
    policy = variant['combiner_class'](
        policy1=bottom_snapshot['naf_policy'],
        policy2=column_snapshot['naf_policy'],
    )
    env = bottom_snapshot['env']
    logger.save_itr_params(0, dict(
        policy=policy,
        env=env,
    ))
    path = rollout(
        env,
        policy,
        max_path_length=variant['max_path_length'],
        animated=variant['render'],
    )
    env.log_diagnostics([path])
    logger.dump_tabular()
def pretrain(self):
    if (self.num_paths_for_normalization == 0
            or (self.obs_normalizer is None
                and self.action_normalizer is None)):
        return

    pretrain_paths = []
    random_policy = RandomPolicy(self.env.action_space)
    while len(pretrain_paths) < self.num_paths_for_normalization:
        path = rollout(self.env, random_policy, self.max_path_length)
        pretrain_paths.append(path)
    ob_mean, ob_std, delta_mean, delta_std, ac_mean, ac_std = (
        compute_normalization(pretrain_paths)
    )
    if self.obs_normalizer is not None:
        self.obs_normalizer.set_mean(ob_mean)
        self.obs_normalizer.set_std(ob_std)
    if self.delta_normalizer is not None:
        self.delta_normalizer.set_mean(delta_mean)
        self.delta_normalizer.set_std(delta_std)
    if self.action_normalizer is not None:
        self.action_normalizer.set_mean(ac_mean)
        self.action_normalizer.set_std(ac_std)
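# Hedged sketch of what compute_normalization is expected to return, based on
# how pretrain() unpacks its result: per-dimension means and stds of the
# observations, of the one-step deltas (next_obs - obs), and of the actions,
# aggregated over all paths. The real implementation may add epsilon terms or
# clipping; this is only an illustration of the assumed interface.
def _compute_normalization_sketch(paths):
    import numpy as np
    obs = np.concatenate([p['observations'] for p in paths], axis=0)
    next_obs = np.concatenate([p['next_observations'] for p in paths], axis=0)
    acts = np.concatenate([p['actions'] for p in paths], axis=0)
    deltas = next_obs - obs
    return (
        obs.mean(axis=0), obs.std(axis=0),
        deltas.mean(axis=0), deltas.std(axis=0),
        acts.mean(axis=0), acts.std(axis=0),
    )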
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['mpc_controller']
    env = data['env']
    print("Policy loaded")
    if args.pause:
        import ipdb
        ipdb.set_trace()
    policy.cost_fn = env.cost_fn
    policy.env = env
    if args.T:
        policy.mpc_horizon = args.T
    paths = []
    while True:
        paths.append(rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
        ))
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        logger.dump_tabular()
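# Hedged sketch of the CLI wrapper these simulate_policy scripts expect. The
# flag names mirror the attributes accessed above (args.file, args.H, args.T,
# args.pause); the defaults and help strings are assumptions, not the
# project's originals.
def _parse_args_sketch():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the snapshot .pkl')
    parser.add_argument('--H', type=int, default=300,
                        help='max path length per rollout')
    parser.add_argument('--T', type=int, default=None,
                        help='override the MPC planning horizon')
    parser.add_argument('--pause', action='store_true',
                        help='drop into ipdb before simulating')
    return parser.parse_args()

# Example invocation (hypothetical):
#     simulate_policy(_parse_args_sketch())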
        logger.dump_tabular()
else:
    for weight in [1]:
        for num_simulated_paths in [args.npath]:
            print("")
            print("weight", weight)
            print("num_simulated_paths", num_simulated_paths)
            policy = CollocationMpcController(
                env,
                implicit_model,
                original_policy,
                num_simulated_paths=num_simulated_paths,
                feasibility_weight=weight,
            )
            policy.train(False)
            paths = []
            for _ in range(5):
                goal = env.sample_goal_for_rollout()
                env.set_goal(goal)
                paths.append(rollout(
                    env,
                    policy,
                    max_path_length=args.H,
                    animated=not args.hide,
                ))
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics(paths)
            final_distance = logger.get_table_dict()[
                'Final Euclidean distance to goal Mean'
            ]
            print("final distance", final_distance)
            # logger.dump_tabular()
env = data['env']
num_samples = 1000
resolution = 10
if 'policy' in data:
    trained_mpc_controller = data['policy']
else:
    trained_mpc_controller = data['exploration_policy'].policy
trained_mpc_controller.env = env
trained_mpc_controller.cost_fn = env.cost_fn
trained_mpc_controller.num_simulated_paths = args.npath
trained_mpc_controller.horizon = 1
if args.justsim:
    while True:
        path = rollout(
            env,
            trained_mpc_controller,
            max_path_length=args.H,
            animated=not args.hide,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
else:
    model = data['model']
    tdm = ModelToTdm(model)

    for weight in [100]:
        for num_simulated_paths in [args.npath]:
            print("")
            print("weight", weight)
            print("num_simulated_paths", num_simulated_paths)
def main():
    model_data = joblib.load(MODEL_PATH)
    model = model_data['model']
    tdm_data = joblib.load(TDM_PATH)
    env = tdm_data['env']
    qf = tdm_data['qf']
    variant_path = Path(TDM_PATH).parents[0] / 'variant.json'
    variant = json.load(variant_path.open())
    reward_scale = variant['sac_tdm_kwargs']['base_kwargs']['reward_scale']
    tdm = ImplicitModel(qf, None)
    random_policy = RandomPolicy(env.action_space)

    # First comparison: per-dimension absolute error of the one-step
    # predictions made by the learned model and by the TDM.
    H = 10
    path = rollout(env, random_policy, max_path_length=H)
    model_distance_preds = []
    tdm_distance_preds = []
    for ob, action, next_ob in zip(
            path['observations'],
            path['actions'],
            path['next_observations'],
    ):
        obs = ob[None]
        actions = action[None]
        next_feature = env.convert_ob_to_goal(next_ob)

        model_next_ob_pred = ob + model.eval_np(obs, actions)[0]
        model_distance_pred = np.abs(
            env.convert_ob_to_goal(model_next_ob_pred) - next_feature
        )

        tdm_next_feature_pred = get_feasible_goal(env, tdm, ob, action)
        tdm_distance_pred = np.abs(tdm_next_feature_pred - next_feature)

        model_distance_preds.append(model_distance_pred)
        tdm_distance_preds.append(tdm_distance_pred)

    model_distances = np.array(model_distance_preds)
    tdm_distances = np.array(tdm_distance_preds)
    ts = np.arange(len(model_distance_preds))
    num_dim = model_distances[0].size

    # Grouped bar chart of the mean absolute error per dimension.
    ind = np.arange(num_dim)
    width = 0.35
    fig, ax = plt.subplots()
    means = model_distances.mean(axis=0)
    stds = model_distances.std(axis=0)
    rects1 = ax.bar(ind, means, width, color='r', yerr=stds)
    means = tdm_distances.mean(axis=0)
    stds = tdm_distances.std(axis=0)
    rects2 = ax.bar(ind + width, means, width, color='y', yerr=stds)
    ax.legend((rects1[0], rects2[0]), ('Model', 'TDM'))
    ax.set_xlabel("Dimension")
    ax.set_ylabel("Absolute Error")
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(list(map(str, ind)))
    plt.show()

    # Error over time, one curve per dimension.
    plt.subplot(2, 1, 1)
    for i in range(num_dim):
        plt.plot(ts, model_distances[:, i], label=str(i))
    plt.xlabel("Time")
    plt.ylabel("Absolute Error")
    plt.title("Model")
    plt.legend()
    plt.subplot(2, 1, 2)
    for i in range(num_dim):
        plt.plot(ts, tdm_distances[:, i], label=str(i))
    plt.xlabel("Time")
    plt.ylabel("Absolute Error")
    plt.title("TDM")
    plt.legend()
    plt.show()

    # Second comparison: predicted distance to a fixed goal state taken from
    # the middle of the first rollout, evaluated along a fresh random rollout.
    goal = env.convert_ob_to_goal(path['observations'][H // 2].copy())
    path = rollout(env, random_policy, max_path_length=H)
    model_distance_preds = []
    tdm_distance_preds = []
    for ob, action, next_ob in zip(
            path['observations'],
            path['actions'],
            path['next_observations'],
    ):
        model_next_ob_pred = ob + model.eval_np(ob[None], action[None])[0]
        model_distance_pred = np.linalg.norm(
            env.convert_ob_to_goal(model_next_ob_pred) - goal
        )
        tdm_distance_pred = tdm.eval_np(
            ob[None],
            goal[None],
            np.zeros((1, 1)),
            action[None],
        )[0] / reward_scale
        model_distance_preds.append(model_distance_pred)
        tdm_distance_preds.append(tdm_distance_pred)

    # Rebuild the distance arrays from the new predictions so the bar chart
    # below reflects the goal-distance errors rather than the one-step errors.
    model_distances = np.array(model_distance_preds)
    tdm_distances = np.array(tdm_distance_preds)

    fig, ax = plt.subplots()
    means = model_distances.mean(axis=0)
    stds = model_distances.std(axis=0)
    rects1 = ax.bar(ind, means, width, color='r', yerr=stds)
    means = tdm_distances.mean(axis=0)
    stds = tdm_distances.std(axis=0)
    rects2 = ax.bar(ind + width, means, width, color='y', yerr=stds)
    ax.legend((rects1[0], rects2[0]), ('Model', 'TDM'))
    ax.set_xlabel("Dimension")
    ax.set_ylabel("Error To Random Goal State")
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(list(map(str, ind)))
    plt.show()
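# The grouped bar chart in main() is built twice with nearly identical code.
# This helper is a sketch (not part of the original script) of how that shared
# plotting logic could be factored out; np.atleast_1d lets it handle both the
# per-dimension errors and the scalar goal-distance errors.
def _plot_error_bars(model_distances, tdm_distances, ylabel, width=0.35):
    import numpy as np
    import matplotlib.pyplot as plt
    means_m = np.atleast_1d(model_distances.mean(axis=0))
    stds_m = np.atleast_1d(model_distances.std(axis=0))
    means_t = np.atleast_1d(tdm_distances.mean(axis=0))
    stds_t = np.atleast_1d(tdm_distances.std(axis=0))
    ind = np.arange(means_m.size)
    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, means_m, width, color='r', yerr=stds_m)
    rects2 = ax.bar(ind + width, means_t, width, color='y', yerr=stds_t)
    ax.legend((rects1[0], rects2[0]), ('Model', 'TDM'))
    ax.set_xlabel("Dimension")
    ax.set_ylabel(ylabel)
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(list(map(str, ind)))
    plt.show()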
        '/home/vitchyr/git/rllab-rail/railrl/data/s3/09-14-pusher-3dof-reacher-naf-yolo-bottom-right/09-14_pusher-3dof-reacher-naf-yolo_bottom-right_2017_09_14_17_52_45_0007/params.pkl'
    ),
    reach_bottom_middle=(
        '/home/vitchyr/git/rllab-rail/railrl/data/s3/09-14-pusher-3dof-reacher-naf-yolo-bottom-middle/09-14_pusher-3dof-reacher-naf-yolo_bottom-middle_2017_09_14_17_52_45_0005/params.pkl'
    ),
)
for name, full_path in files.items():
    name = name.replace('_', '-')  # in case Tuomas's script cares
    data = joblib.load(full_path)
    if 'policy' in data:
        policy = data['policy']
    else:
        policy = data['naf_policy']
    env = data['env']
    print(name)

    # Record the final positions (observation dims -3:-1) over 100 rollouts.
    pos_lst = list()
    for i in range(100):
        path = rollout(env, policy, max_path_length=300, animated=False)
        pos_lst.append(path['final_observation'][-3:-1])

    pos_all = np.stack(pos_lst)
    outfile = os.path.join('data/papers/icra2018/results/pusher/naf',
                           name + '.txt')
    np.savetxt(outfile, pos_all)
horizontal_pos = 'bottom'
ddpg1_snapshot_path, ddpg2_snapshot_path, x_goal, y_goal = (
    get_snapshots_and_goal(
        vertical_pos=vertical_pos,
        horizontal_pos=horizontal_pos,
    )
)
env_params = dict(
    goal=(x_goal, y_goal),
)
env = PusherEnv3DOF(**env_params)
env = normalize(env)
ddpg1_snapshot_dict = joblib.load(ddpg1_snapshot_path)
ddpg2_snapshot_dict = joblib.load(ddpg2_snapshot_path)
policy = AveragerPolicy(
    ddpg1_snapshot_dict['policy'],
    ddpg2_snapshot_dict['policy'],
)
while True:
    path = rollout(
        env,
        policy,
        max_path_length=args.H,
        animated=True,
    )
    env.log_diagnostics([path])
    policy.log_diagnostics([path])
    logger.dump_tabular()
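# Hedged sketch of the AveragerPolicy combiner used above, assuming it simply
# averages the actions of its two wrapped policies; the real class may combine
# them differently (e.g. averaging pre-tanh values) or log richer diagnostics.
class _AveragerPolicySketch(object):
    def __init__(self, policy1, policy2):
        self.policy1 = policy1
        self.policy2 = policy2

    def get_action(self, observation):
        # Assumes the rllab/railrl get_action convention of returning
        # (action, agent_info).
        a1, _ = self.policy1.get_action(observation)
        a2, _ = self.policy2.get_action(observation)
        return (a1 + a2) / 2.0, {}

    def log_diagnostics(self, paths):
        # No combiner-specific diagnostics in this sketch.
        pass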