Example #1
import os
import os.path as osp
import time

import numpy as np
import scipy.misc  # note: imsave was removed in SciPy 1.2; use imageio.imwrite on newer SciPy
import skvideo.io


def dump_video(
    env,
    policy,
    filename,
    ROWS=3,
    COLUMNS=6,
    do_timer=True,
    horizon=100,
    image_env=None,
    dirname=None,
    subdirname="rollouts",
):
    policy.train(False)  # put the policy in evaluation mode before rolling out
    paths = []
    num_channels = env.vae.input_channels
    frames = []
    N = ROWS * COLUMNS
    for i in range(N):
        rollout_dir = osp.join(dirname, subdirname, str(i))
        os.makedirs(rollout_dir, exist_ok=True)
        start = time.time()
        paths.append(
            rollout(
                env,
                policy,
                frames,
                max_path_length=horizon,
                animated=False,
                image_env=image_env,
            ))
        rollout_frames = frames[-(horizon + 1):]  # frames appended by this rollout
        goal_img = np.flip(rollout_frames[0][:84, :84, :], 0)
        scipy.misc.imsave(rollout_dir + "/goal.png", goal_img)
        goal_img = np.flip(rollout_frames[1][:84, :84, :], 0)
        scipy.misc.imsave(rollout_dir + "/z_goal.png", goal_img)
        for j in range(horizon + 1):
            img = np.flip(rollout_frames[j][84:, :84, :], 0)
            scipy.misc.imsave(rollout_dir + "/" + str(j) + ".png", img)
        if do_timer:
            print(i, time.time() - start)

    # H and W are the per-frame height and width; they are module-level
    # constants in the original source and are not defined in this snippet.
    frames = np.array(frames, dtype=np.uint8).reshape(
        (N, horizon + 1, H, W, num_channels))
    f1 = []
    for k1 in range(COLUMNS):
        f2 = []
        for k2 in range(ROWS):
            k = k1 * ROWS + k2
            f2.append(frames[k:k + 1, :, :, :, :].reshape(
                (horizon + 1, H, W, num_channels)))
        f1.append(np.concatenate(f2, axis=1))
    outputdata = np.concatenate(f1, axis=2)
    skvideo.io.vwrite(filename, outputdata)
    print("Saved video to ", filename)

    return paths
def obtain_samples(self):
    paths = []
    n_steps_total = 0
    while n_steps_total + self.max_path_length <= self.max_samples:
        path = rollout(self.env,
                       self.policy,
                       max_path_length=self.max_path_length,
                       animated=self.render)
        paths.append(path)
        n_steps_total += len(path['observations'])
    return paths
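
Every snippet on this page consumes the dictionary that `rollout` returns. Below is a minimal sketch of that contract, inferred from the keys these examples access ('observations', 'actions', 'next_observations', 'final_observation'); the policy API (`get_action` returning an action plus an info dict) is an assumption, and the real railrl/rlkit helper also records rewards, terminals, and per-step infos.

import numpy as np

def rollout_sketch(env, policy, max_path_length=100):
    # Collect one episode and return it in the dict format the examples on
    # this page index into. This is a sketch, not the library's implementation.
    observations, actions, next_observations = [], [], []
    ob = env.reset()
    for _ in range(max_path_length):
        action, _ = policy.get_action(ob)  # assumed policy interface
        next_ob, reward, done, info = env.step(action)
        observations.append(ob)
        actions.append(action)
        next_observations.append(next_ob)
        ob = next_ob
        if done:
            break
    return dict(
        observations=np.array(observations),
        actions=np.array(actions),
        next_observations=np.array(next_observations),
        final_observation=ob,
    )

With this structure, obtain_samples above budgets by len(path['observations']), i.e. by the number of environment steps actually taken.
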
Example #3
def simulate_policy(args):
    dir = args.path
    data = joblib.load("{}/params.pkl".format(dir))
    env = data['env']
    model_params = data['model_params']
    mpc_params = data['mpc_params']
    # dyn_model = NNDynamicsModel(env=env, **model_params)
    # mpc_controller = MPCcontroller(env=env,
    #                                dyn_model=dyn_model,
    #                                **mpc_params)
    tf_path_meta = "{}/tf_out-0.meta".format(dir)
    tf_path = "{}/tf_out-0".format(dir)

    # Note: this session closes right after restoring; the restored graph is
    # not used further down in this snippet.
    with tf.Session() as sess:
        new_saver = tf.train.import_meta_graph(tf_path_meta)
        new_saver.restore(sess, tf_path)

    policy = data['policy']  # assumption: the snapshot also stores the policy used below
    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.to(ptu.device)
    if args.pause:
        import ipdb
        ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        try:
            path = rollout(
                env,
                policy,
                max_path_length=args.H,
                animated=True,
            )
            env.log_diagnostics([path])
            policy.log_diagnostics([path])
            logger.dump_tabular()
        # Hack for now. Not sure why rollout assumes that close is a
        # keyword argument.
        except TypeError as e:
            if (str(e) != "render() got an unexpected keyword "
                    "argument 'close'"):
                raise e
def create_policy(variant):
    bottom_snapshot = joblib.load(variant['bottom_path'])
    column_snapshot = joblib.load(variant['column_path'])
    policy = variant['combiner_class'](
        policy1=bottom_snapshot['naf_policy'],
        policy2=column_snapshot['naf_policy'],
    )
    env = bottom_snapshot['env']
    logger.save_itr_params(0, dict(
        policy=policy,
        env=env,
    ))
    path = rollout(
        env,
        policy,
        max_path_length=variant['max_path_length'],
        animated=variant['render'],
    )
    env.log_diagnostics([path])
    logger.dump_tabular()
    def pretrain(self):
        if (self.num_paths_for_normalization == 0 or
            (self.obs_normalizer is None and self.action_normalizer is None)):
            return

        pretrain_paths = []
        random_policy = RandomPolicy(self.env.action_space)
        while len(pretrain_paths) < self.num_paths_for_normalization:
            path = rollout(self.env, random_policy, self.max_path_length)
            pretrain_paths.append(path)
        ob_mean, ob_std, delta_mean, delta_std, ac_mean, ac_std = (
            compute_normalization(pretrain_paths))
        if self.obs_normalizer is not None:
            self.obs_normalizer.set_mean(ob_mean)
            self.obs_normalizer.set_std(ob_std)
        if self.delta_normalizer is not None:
            self.delta_normalizer.set_mean(delta_mean)
            self.delta_normalizer.set_std(delta_std)
        if self.action_normalizer is not None:
            self.action_normalizer.set_mean(ac_mean)
            self.action_normalizer.set_std(ac_std)
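
`compute_normalization` is not shown in these snippets. Given the six values `pretrain` unpacks, a plausible sketch would pool the pretraining paths and return per-dimension means and standard deviations for observations, observation deltas, and actions; the actual helper may differ.

import numpy as np

def compute_normalization_sketch(paths):
    # Pool all transitions from the pretraining rollouts.
    obs = np.concatenate([p['observations'] for p in paths], axis=0)
    next_obs = np.concatenate([p['next_observations'] for p in paths], axis=0)
    acts = np.concatenate([p['actions'] for p in paths], axis=0)
    deltas = next_obs - obs
    # Six per-dimension statistics, matching the tuple unpacked in pretrain().
    return (obs.mean(axis=0), obs.std(axis=0),
            deltas.mean(axis=0), deltas.std(axis=0),
            acts.mean(axis=0), acts.std(axis=0))
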
Example #6
def simulate_policy(args):
    data = joblib.load(args.file)
    policy = data['mpc_controller']
    env = data['env']
    print("Policy loaded")
    if args.pause:
        import ipdb
        ipdb.set_trace()
    policy.cost_fn = env.cost_fn
    policy.env = env
    if args.T:
        policy.mpc_horizon = args.T
    paths = []
    while True:
        paths.append(
            rollout(
                env,
                policy,
                max_path_length=args.H,
                animated=True,
            ))
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        logger.dump_tabular()
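
The `args` object used throughout these simulate_policy examples comes from each script's own command line. A hypothetical argparse setup covering the flags the snippets read (`file`, `H`, `T`, `npath`, `gpu`, `pause`, `hide`; Example #3 reads `args.path` instead of `args.file`). The names and defaults here are assumptions, not the scripts' actual parsers.

import argparse

def parse_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to a params.pkl snapshot')
    parser.add_argument('--H', type=int, default=100, help='max rollout length')
    parser.add_argument('--T', type=int, default=0, help='override the MPC horizon')
    parser.add_argument('--npath', type=int, default=100,
                        help='number of simulated paths for MPC / collocation')
    parser.add_argument('--gpu', action='store_true', help='run the policy on GPU')
    parser.add_argument('--pause', action='store_true',
                        help='drop into ipdb before rolling out')
    parser.add_argument('--hide', action='store_true',
                        help='disable rendering (animated=not args.hide)')
    return parser.parse_args()
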
Example #7
            logger.dump_tabular()
    else:
        for weight in [1]:
            for num_simulated_paths in [args.npath]:
                print("")
                print("weight", weight)
                print("num_simulated_paths", num_simulated_paths)
                policy = CollocationMpcController(
                    env,
                    implicit_model,
                    original_policy,
                    num_simulated_paths=num_simulated_paths,
                    feasibility_weight=weight,
                )
                policy.train(False)
                paths = []
                for _ in range(5):
                    goal = env.sample_goal_for_rollout()
                    env.set_goal(goal)
                    paths.append(rollout(
                        env,
                        policy,
                        max_path_length=args.H,
                        animated=not args.hide,
                    ))
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics(paths)
                final_distance = logger.get_table_dict()['Final Euclidean distance to goal Mean']
                print("final distance", final_distance)
                # logger.dump_tabular()
Example #8
    env = data['env']
    num_samples = 1000
    resolution = 10
    if 'policy' in data:
        trained_mpc_controller = data['policy']
    else:
        trained_mpc_controller = data['exploration_policy'].policy
    trained_mpc_controller.env = env
    trained_mpc_controller.cost_fn = env.cost_fn
    trained_mpc_controller.num_simulated_paths = args.npath
    trained_mpc_controller.horizon = 1
    if args.justsim:
        while True:
            path = rollout(
                env,
                trained_mpc_controller,
                max_path_length=args.H,
                animated=not args.hide,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()
    else:

        model = data['model']
        tdm = ModelToTdm(model)

        for weight in [100]:
            for num_simulated_paths in [args.npath]:
                print("")
                print("weight", weight)
                print("num_simulated_paths", num_simulated_paths)
def main():
    model_data = joblib.load(MODEL_PATH)
    model = model_data['model']
    tdm_data = joblib.load(TDM_PATH)
    env = tdm_data['env']
    qf = tdm_data['qf']
    variant_path = Path(TDM_PATH).parents[0] / 'variant.json'
    variant = json.load(variant_path.open())
    reward_scale = variant['sac_tdm_kwargs']['base_kwargs']['reward_scale']
    tdm = ImplicitModel(qf, None)
    random_policy = RandomPolicy(env.action_space)
    H = 10
    path = rollout(env, random_policy, max_path_length=H)

    model_distance_preds = []
    tdm_distance_preds = []
    for ob, action, next_ob in zip(
            path['observations'],
            path['actions'],
            path['next_observations'],
    ):
        obs = ob[None]
        actions = action[None]
        next_feature = env.convert_ob_to_goal(next_ob)
        model_next_ob_pred = ob + model.eval_np(obs, actions)[0]
        model_distance_pred = np.abs(
            env.convert_ob_to_goal(model_next_ob_pred) - next_feature)

        tdm_next_feature_pred = get_feasible_goal(env, tdm, ob, action)
        tdm_distance_pred = np.abs(tdm_next_feature_pred - next_feature)

        model_distance_preds.append(model_distance_pred)
        tdm_distance_preds.append(tdm_distance_pred)

    model_distances = np.array(model_distance_preds)
    tdm_distances = np.array(tdm_distance_preds)
    ts = np.arange(len(model_distance_preds))
    num_dim = model_distances[0].size
    ind = np.arange(num_dim)
    width = 0.35

    fig, ax = plt.subplots()
    means = model_distances.mean(axis=0)
    stds = model_distances.std(axis=0)
    rects1 = ax.bar(ind, means, width, color='r', yerr=stds)

    means = tdm_distances.mean(axis=0)
    stds = tdm_distances.std(axis=0)
    rects2 = ax.bar(ind + width, means, width, color='y', yerr=stds)
    ax.legend((rects1[0], rects2[0]), ('Model', 'TDM'))
    ax.set_xlabel("Dimension")
    ax.set_ylabel("Absolute Error")
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(list(map(str, ind)))

    plt.show()

    plt.subplot(2, 1, 1)
    for i in range(num_dim):
        plt.plot(
            ts,
            model_distances[:, i],
            label=str(i),
        )
    plt.xlabel("Time")
    plt.ylabel("Absolute Error")
    plt.title("Model")
    plt.legend()

    plt.subplot(2, 1, 2)
    for i in range(num_dim):
        plt.plot(
            ts,
            tdm_distances[:, i],
            label=str(i),
        )
    plt.xlabel("Time")
    plt.ylabel("Absolute Error")
    plt.title("TDM")
    plt.legend()
    plt.show()

    goal = env.convert_ob_to_goal(path['observations'][H // 2].copy())
    path = rollout(env, random_policy, max_path_length=H)

    model_distance_preds = []
    tdm_distance_preds = []
    for ob, action, next_ob in zip(
            path['observations'],
            path['actions'],
            path['next_observations'],
    ):
        model_next_ob_pred = ob + model.eval_np(ob[None], action[None])[0]
        model_distance_pred = np.linalg.norm(
            env.convert_ob_to_goal(model_next_ob_pred) - goal)

        tdm_distance_pred = tdm.eval_np(
            ob[None],
            goal[None],
            np.zeros((1, 1)),
            action[None],
        )[0] / reward_scale

        model_distance_preds.append(model_distance_pred)
        tdm_distance_preds.append(tdm_distance_pred)

    # Rebuild the distance arrays from the new predictions; otherwise the bar
    # chart below would reuse the arrays computed for the first rollout.
    model_distances = np.array(model_distance_preds)
    tdm_distances = np.array(tdm_distance_preds)
    fig, ax = plt.subplots()
    means = model_distances.mean(axis=0)
    stds = model_distances.std(axis=0)
    rects1 = ax.bar(ind, means, width, color='r', yerr=stds)

    means = tdm_distances.mean(axis=0)
    stds = tdm_distances.std(axis=0)
    rects2 = ax.bar(ind + width, means, width, color='y', yerr=stds)
    ax.legend((rects1[0], rects2[0]), ('Model', 'TDM'))
    ax.set_xlabel("Dimension")
    ax.set_ylabel("Error To Random Goal State")
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(list(map(str, ind)))

    plt.show()
        '/home/vitchyr/git/rllab-rail/railrl/data/s3/09-14-pusher-3dof-reacher-naf-yolo-bottom-right/09-14_pusher-3dof-reacher-naf-yolo_bottom-right_2017_09_14_17_52_45_0007/params.pkl'
    ),
    reach_bottom_middle=(
        '/home/vitchyr/git/rllab-rail/railrl/data/s3/09-14-pusher-3dof-reacher-naf-yolo-bottom-middle/09-14_pusher-3dof-reacher-naf-yolo_bottom-middle_2017_09_14_17_52_45_0005/params.pkl'
    ),
)


for name, full_path in files.items():
    name = name.replace('_', '-')  # in case Tuomas's script cares

    data = joblib.load(full_path)
    if 'policy' in data:
        policy = data['policy']
    else:
        policy = data['naf_policy']
    env = data['env']

    print(name)

    pos_lst = list()
    for i in range(100):
        path = rollout(env, policy, max_path_length=300, animated=False)
        pos_lst.append(path['final_observation'][-3:-1])

    pos_all = np.stack(pos_lst)

    outfile = os.path.join('data/papers/icra2018/results/pusher/naf',
                           name + '.txt')
    np.savetxt(outfile, pos_all)
    horizontal_pos = 'bottom'

    ddpg1_snapshot_path, ddpg2_snapshot_path, x_goal, y_goal = (
        get_snapshots_and_goal(
            vertical_pos=vertical_pos,
            horizontal_pos=horizontal_pos,
        )
    )
    env_params = dict(
        goal=(x_goal, y_goal),
    )
    env = PusherEnv3DOF(**env_params)
    env = normalize(env)
    ddpg1_snapshot_dict = joblib.load(ddpg1_snapshot_path)
    ddpg2_snapshot_dict = joblib.load(ddpg2_snapshot_path)
    policy = AveragerPolicy(
        ddpg1_snapshot_dict['policy'],
        ddpg2_snapshot_dict['policy'],
    )

    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
        )
        env.log_diagnostics([path])
        policy.log_diagnostics([path])
        logger.dump_tabular()
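
Both `variant['combiner_class']` in create_policy above and `AveragerPolicy` here wrap two trained policies into a single one. The concrete class is not shown on this page; a minimal sketch of an action-averaging combiner under the same `get_action` interface assumed earlier (the real AveragerPolicy may combine the policies differently):

class AveragerPolicySketch:
    """Query two policies and return the mean of their actions."""

    def __init__(self, policy1, policy2):
        self.policy1 = policy1
        self.policy2 = policy2

    def get_action(self, observation):
        a1, _ = self.policy1.get_action(observation)
        a2, _ = self.policy2.get_action(observation)
        return 0.5 * (a1 + a2), {}

    def log_diagnostics(self, paths):
        # The loop above calls policy.log_diagnostics([path]); a no-op keeps
        # this sketch compatible with it.
        pass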