def _get_best_single_option_policy(self):
    best_returns = float('-inf')
    best_z = None
    for z in range(self._num_skills):
        fixed_z_policy = FixedOptionPolicy(self._policy, self._num_skills, z)
        paths = rollouts(self._eval_env, fixed_z_policy,
                         self._max_path_length, self._best_skill_n_rollouts,
                         render=False)
        total_returns = np.mean([path['rewards'].sum() for path in paths])
        if total_returns > best_returns:
            best_returns = total_returns
            best_z = z
    return FixedOptionPolicy(self._policy, self._num_skills, best_z)
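
Every example below wraps the snapshot's skill-conditioned policy in FixedOptionPolicy(policy, num_skills, z). Its implementation is not shown in these snippets; the following is only a minimal sketch of the presumed behavior, assuming the base policy expects the environment observation concatenated with a one-hot skill vector (all names except get_action are assumptions):

import numpy as np

class FixedOptionPolicy:
    """Expose a skill-conditioned policy as an ordinary policy for one fixed skill z."""

    def __init__(self, base_policy, num_skills, z):
        self._base_policy = base_policy
        self._z_one_hot = np.zeros(num_skills)
        self._z_one_hot[z] = 1.0

    def get_action(self, obs):
        # Append the fixed one-hot skill code to the observation before
        # querying the underlying skill-conditioned policy.
        aug_obs = np.concatenate([obs, self._z_one_hot])
        return self._base_policy.get_action(aug_obs)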
Example #2
def dump_trace(picklefile: str, args):

    filename = '{}_{}_{}_trace.png'.format(os.path.splitext(picklefile)[0],
                                           args.dim_0, args.dim_1)

    with tf.Session(), tf.variable_scope(picklefile):
        data = joblib.load(picklefile)
        policy = data['policy']
        env = data['env']
        # The policy's observation is the env observation plus a one-hot skill
        # vector, so the extra flat dims give the number of skills.
        num_skills = (data['policy'].observation_space.flat_dim
                      - data['env'].spec.observation_space.flat_dim)

        plt.figure(figsize=(6, 6))
        palette = sns.color_palette('hls', num_skills)
        with policy.deterministic(args.deterministic):
            skills = range(num_skills) if args.specific_skill == _use_all_skills else [args.specific_skill]
            for z in skills:
                fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
                for path_index in range(args.n_paths):
                    obs = env.reset()
                    if args.use_qpos:
                        qpos = env.wrapped_env.env.model.data.qpos[:, 0]
                        obs_vec = [qpos]
                    else:
                        obs_vec = [obs]
                    for t in range(args.max_path_length):
                        action, _ = fixed_z_policy.get_action(obs)
                        (obs, _, _, _) = env.step(action)
                        if args.use_qpos:
                            qpos = env.wrapped_env.env.model.data.qpos[:, 0]
                            obs_vec.append(qpos)
                        elif args.use_action:
                            obs_vec.append(action)
                        else:
                            obs_vec.append(obs)

                    obs_vec = np.array(obs_vec)
                    x = obs_vec[:, args.dim_0]
                    y = obs_vec[:, args.dim_1]
                    plt.plot(x, y, c=palette[z])

                    use_plot_lims = np.isfinite(env.observation_space.bounds).all()
                    if use_plot_lims:
                        xlim, ylim = np.asarray(env.observation_space.bounds).T
                        plt.xlim(xlim)
                        plt.ylim(ylim)

        plt.savefig(filename)
        plt.close()
Example #3
    def _save_traces(self, filename):
        utils._make_dir(filename)
        obs_vec = []
        for z in range(self._num_skills):
            fixed_z_policy = FixedOptionPolicy(self._policy,
                                               self._num_skills, z)
            paths = rollouts(self._eval_env, fixed_z_policy,
                             self._max_path_length, n_paths=3,
                             render=False)
            obs_vec.append([path['observations'].tolist() for path in paths])

        with open(filename, 'w') as f:
            json.dump(obs_vec, f)
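
The traces written by _save_traces are plain JSON: a list over skills, each holding a list of per-rollout observation sequences. A minimal sketch of reading them back (the filename is a placeholder):

import json
import numpy as np

with open('traces.json') as f:          # whatever filename was passed to _save_traces
    traces = json.load(f)

for z, skill_paths in enumerate(traces):
    for obs_seq in skill_paths:
        obs = np.array(obs_seq)         # shape (T, obs_dim)
        print('skill', z, obs.shape)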
Example #4
def collect_expert_trajectories(expert_snapshot, max_path_length):
    tf.logging.info('Collecting expert trajectories')
    with tf.Session() as sess:
        data = joblib.load(expert_snapshot)
        policy = data['policy']
        env = data['env']
        num_skills = data['policy'].observation_space.flat_dim - data['env'].spec.observation_space.flat_dim
        traj_vec = []
        with policy.deterministic(True):
            for z in range(num_skills):
                fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
                new_paths = rollouts(env, fixed_z_policy,
                                     max_path_length, n_paths=1)
                path = new_paths[0]
                traj_vec.append(path)
    tf.reset_default_graph()
    return traj_vec
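
A minimal usage sketch (the snapshot path is hypothetical); the tf.reset_default_graph() call suggests the trajectories are meant to be collected before a fresh graph is built, e.g. for imitation or finetuning:

expert_trajs = collect_expert_trajectories('data/itr_999.pkl', max_path_length=1000)
print('collected %d expert trajectories' % len(expert_trajs))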
Example #5
def get_best_skill(policy, env, num_skills, max_path_length):
    tf.logging.info('Finding best skill to finetune...')
    reward_list = []
    with policy.deterministic(True):
        for z in range(num_skills):
            fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
            new_paths = rollouts(env,
                                 fixed_z_policy,
                                 max_path_length,
                                 n_paths=2)
            total_returns = np.mean(
                [path['rewards'].sum() for path in new_paths])
            tf.logging.info('Reward for skill %d = %.3f', z, total_returns)
            reward_list.append(total_returns)

    best_z = np.argmax(reward_list)
    tf.logging.info('Best skill found: z = %d, reward = %.3f', best_z,
                    reward_list[best_z])
    return best_z
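
Combined with FixedOptionPolicy, this yields a single-skill policy for finetuning or evaluation, mirroring _get_best_single_option_policy above (a sketch; policy, env, and num_skills are loaded from a snapshot as in the other examples):

best_z = get_best_skill(policy, env, num_skills, max_path_length=1000)
finetune_policy = FixedOptionPolicy(policy, num_skills, best_z)
paths = rollouts(env, finetune_policy, 1000, n_paths=1)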
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='Path to the snapshot file.')
    parser.add_argument('--max-path-length', '-l', type=int, default=100)
    parser.add_argument('--speedup', '-s', type=float, default=1)
    parser.add_argument('--deterministic',
                        '-d',
                        dest='deterministic',
                        action='store_true')
    parser.add_argument('--no-deterministic',
                        '-nd',
                        dest='deterministic',
                        action='store_false')
    parser.add_argument('--separate_videos', action='store_true')
    parser.set_defaults(deterministic=True)

    # unity_env args
    parser.add_argument('--idx', type=int, default=0)
    parser.add_argument('--no_graphics', action='store_true')

    args = parser.parse_args()
    filename = os.path.splitext(args.file)[0] + '.avi'
    best_filename = os.path.splitext(args.file)[0] + '_best.avi'
    worst_filename = os.path.splitext(args.file)[0] + '_worst.avi'

    path_list = []
    reward_list = []

    with tf.Session() as sess:
        data = joblib.load(args.file)
        policy = data['policy']
        env = data['env']
        num_skills = data['policy'].observation_space.flat_dim - data[
            'env'].spec.observation_space.flat_dim

        with policy.deterministic(args.deterministic):
            for z in range(num_skills):
                fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
                new_paths = rollouts(env,
                                     fixed_z_policy,
                                     args.max_path_length,
                                     n_paths=1,
                                     render=True,
                                     render_mode='rgb_array')
                path_list.append(new_paths)
                total_returns = np.mean(
                    [path['rewards'].sum() for path in new_paths])
                reward_list.append(total_returns)

                if args.separate_videos:
                    base = os.path.splitext(args.file)[0]
                    end = '_skill_%02d.avi' % z
                    skill_filename = base + end
                    utils._save_video(new_paths, skill_filename)

                import csv  # only needed for the per-skill observation dump
                file_path = args.file.split('/')
                file_path = file_path[-1].split('.')[0]
                file_path = './data/' + file_path
                if not os.path.exists(file_path):
                    os.makedirs(file_path)  # also creates ./data if it is missing
                print(file_path)
                with open(file_path + '/path%02d.csv' % z, 'w',
                          newline='') as csvfile:
                    spamwriter = csv.writer(csvfile,
                                            delimiter=' ',
                                            quotechar='|',
                                            quoting=csv.QUOTE_MINIMAL)
                    spamwriter.writerow(
                        ['X', '-X', 'Y', '-Y', 'X_speed', 'Y_speed'])
                    for ob in path_list[-1][0]['observations']:
                        spamwriter.writerow(ob)

        if not args.separate_videos:
            paths = [path for paths in path_list for path in paths]
            utils._save_video(paths, filename)

        print('Best reward: %.3f' % np.max(reward_list))
        print('Worst reward: %.3f' % np.min(reward_list))
        # Record extra long videos for best and worst skills:
        best_z = np.argmax(reward_list)
        worst_z = np.argmin(reward_list)
        for (z, filename) in [(best_z, best_filename),
                              (worst_z, worst_filename)]:
            fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
            new_paths = rollouts(env,
                                 fixed_z_policy,
                                 3 * args.max_path_length,
                                 n_paths=1,
                                 render=True,
                                 render_mode='rgb_array')
            utils._save_video(new_paths, filename)
        env.terminate()
Example #7
    args = parser.parse_args()
    filename = '{}_{}_{}_trace.png'.format(
        os.path.splitext(args.file)[0], args.dim_0, args.dim_1)

    with tf.Session() as sess:
        data = joblib.load(args.file)
        policy = data['policy']
        env = data['env']
        num_skills = data['policy'].observation_space.flat_dim - data[
            'env'].spec.observation_space.flat_dim

        plt.figure(figsize=(6, 6))
        palette = sns.color_palette('hls', num_skills)
        with policy.deterministic(args.deterministic):
            for z in range(num_skills):
                fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
                for path_index in range(args.n_paths):
                    obs = env.reset()
                    if args.use_qpos:
                        qpos = env.wrapped_env.env.model.data.qpos[:, 0]
                        obs_vec = [qpos]
                    else:
                        obs_vec = [obs]
                    for t in range(args.max_path_length):
                        action, _ = fixed_z_policy.get_action(obs)
                        (obs, _, _, _) = env.step(action)
                        if args.use_qpos:
                            qpos = env.wrapped_env.env.model.data.qpos[:, 0]
                            obs_vec.append(qpos)
                        elif args.use_action:
                            obs_vec.append(action)
                        else:
                            obs_vec.append(obs)

                    obs_vec = np.array(obs_vec)
                    x = obs_vec[:, args.dim_0]
                    y = obs_vec[:, args.dim_1]
                    plt.plot(x, y, c=palette[z])

        plt.savefig(filename)
        plt.close()
Example #8
    filename = os.path.splitext(args.file)[0] + '.avi'
    best_filename = os.path.splitext(args.file)[0] + '_best.avi'
    worst_filename = os.path.splitext(args.file)[0] + '_worst.avi'

    path_list = []
    reward_list = []

    with tf.Session() as sess:
        data = joblib.load(args.file)
        policy = data['policy']
        env = data['env']
        num_skills = data['policy'].observation_space.flat_dim - data['env'].spec.observation_space.flat_dim

        with policy.deterministic(args.deterministic):
            for z in range(num_skills):
                fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
                new_paths = rollouts(env, fixed_z_policy,
                                     args.max_path_length, n_paths=1,
                                     render=True, render_mode='rgb_array')
                path_list.append(new_paths)
                total_returns = np.mean([path['rewards'].sum() for path in new_paths])
                reward_list.append(total_returns)

                if args.separate_videos:
                    base = os.path.splitext(args.file)[0]
                    end = '_skill_%02d.avi' % z
                    skill_filename = base + end
                    utils._save_video(new_paths, skill_filename)

        if not args.separate_videos:
            paths = [path for paths in path_list for path in paths]
            utils._save_video(paths, filename)
Example #9
    path_list = []
    reward_list = []

    with tf.Session() as sess:
        data = joblib.load(args.file)
        policy = data['policy']
        env = data['env']
        #pdb.set_trace()
        num_skills = get_num_skills(policy, env, args.concat_type)

        #num_skills = data['policy'].observation_space.flat_dim - data['env'].spec.observation_space.flat_dim

        concat_type = args.concat_type
        with policy.deterministic(args.deterministic):
            for z in range(num_skills):
                fixed_z_policy = FixedOptionPolicy(policy, num_skills, z,
                                                   concat_type)
                new_paths = rollouts(env,
                                     fixed_z_policy,
                                     args.max_path_length,
                                     n_paths=1,
                                     render=True,
                                     render_mode='rgb_array')
                path_list.append(new_paths)
                total_returns = np.mean(
                    [path['rewards'].sum() for path in new_paths])
                reward_list.append(total_returns)

                if args.separate_videos:
                    base = os.path.splitext(args.file)[0]
                    end = '_skill_%02d.avi' % z
                    skill_filename = base + end
                    utils._save_video(new_paths, skill_filename)