Example #1
def evaluate_policy(e,
                    policy,
                    learned_model,
                    noise_level=0.0,
                    real_step=False,
                    num_episodes=10,
                    visualize=False):
    # rollout the policy on env and record performance
    paths = []
    for ep in range(num_episodes):
        e.reset()
        observations = []
        actions = []
        rewards = []
        env_infos = []
        t = 0
        done = False
        while t < e.horizon and not done:
            o = e.get_obs()
            ifo = e.get_env_infos()
            a = policy.get_action(o)
            if isinstance(a, list):
                a = a[1]['evaluation']
            if noise_level > 0.0:
                a = a + e.env.env.np_random.uniform(
                    low=-noise_level, high=noise_level, size=a.shape[0])
            if not real_step:
                # step the learned dynamics model instead of the real environment;
                # rewards are recomputed for the whole path after the rollout
                next_s = learned_model.predict(o, a)
                r = 0.0  # placeholder; filled in by compute_path_rewards below
                e.env.env.set_fitted_state(next_s)
            else:
                next_o, r, done, ifo2 = e.step(a)
                ifo = ifo2 if ifo == {} else ifo
            if visualize:
                e.render()

            t = t + 1
            observations.append(o)
            actions.append(a)
            rewards.append(r)
            env_infos.append(ifo)

        path = dict(observations=np.array(observations),
                    actions=np.array(actions),
                    rewards=np.array(rewards),
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos))
        if real_step is False:
            e.env.env.compute_path_rewards(path)
            try:
                path = e.env.env.truncate_paths([path])[0]
            except Exception:
                # env does not support path truncation; keep the full path
                pass
        paths.append(path)
        if visualize:
            print("episode score = %f " % np.sum(path['rewards']))
    return paths
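
A minimal usage sketch for the rollout above (illustrative only: the env id and the mpc_policy / fitted_model objects are placeholders, and the fitted model is assumed to expose the predict(obs, act) method used in the snippet):

import numpy as np

e = GymEnv('Hopper-v3')  # placeholder env id
paths = evaluate_policy(e, policy=mpc_policy, learned_model=fitted_model,
                        noise_level=0.05, real_step=False, num_episodes=5)
scores = [np.sum(p['rewards']) for p in paths]
print("mean model-based return: %.2f" % np.mean(scores))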
Example #2
def sample_paths(
    num_traj,
    env,
    policy,  # mpc policy on fitted model
    horizon=1e6,
    eval_mode=True,
    base_seed=None,
    noise_level=0.1,
):

    # get the correct env behavior
    if isinstance(env, str):
        env = GymEnv(env)
    elif isinstance(env, GymEnv):
        pass  # already a wrapped environment
    elif callable(env):
        env = env()
    else:
        raise AttributeError("Unsupported environment format")
    if base_seed is not None:
        env.set_seed(base_seed)
    horizon = min(horizon, env.horizon)
    paths = []
    for ep in range(num_traj):
        env.reset()
        observations = []
        actions = []
        rewards = []
        env_infos = []
        t = 0
        done = False
        while t < horizon and not done:
            obs = env.get_obs()
            ifo = env.get_env_infos()
            act = policy.get_action(obs)
            if not eval_mode and not isinstance(act, list):
                act = act + np.random.uniform(
                    low=-noise_level, high=noise_level, size=act.shape[0])
            if isinstance(act, list):
                # policies returning [action, info] expose the greedy action
                # under info['evaluation']
                act = act[0] if not eval_mode else act[1]['evaluation']
            next_obs, reward, done, _ = env.step(act)
            t = t + 1
            observations.append(obs)
            actions.append(act)
            rewards.append(reward)
            env_infos.append(ifo)
        path = dict(observations=np.array(observations),
                    actions=np.array(actions),
                    rewards=np.array(rewards),
                    terminated=done,
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos))
        paths.append(path)
    return paths
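
A short usage sketch (the policy object and the gym id are placeholders; as shown above, sample_paths also accepts a GymEnv instance or a factory callable in place of the string id):

# exploration rollouts with action noise (illustrative call)
train_paths = sample_paths(num_traj=20, env='Hopper-v3', policy=mpc_policy,
                           eval_mode=False, noise_level=0.1, base_seed=123)
# noiseless evaluation rollouts through the same interface
eval_paths = sample_paths(num_traj=5, env='Hopper-v3', policy=mpc_policy,
                          eval_mode=True, base_seed=123)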
Example #3
def do_rollout(num_traj,
               env,
               policy,
               eval_mode=False,
               horizon=1e6,
               base_seed=None,
               env_kwargs=None,
               init_states_per_cpu=None):
    """
    :param num_traj:    number of trajectories (int)
    :param env:         environment (env class, str with env_name, or factory function)
    :param policy:      policy to use for action selection
    :param eval_mode:   use evaluation mode for action computation (bool)
    :param horizon:     max horizon length for rollout (<= env.horizon)
    :param base_seed:   base seed for rollouts (int)
    :param env_kwargs:  dictionary with parameters, will be passed to env generator
    :param init_states_per_cpu: list of init states to initialize from for fixed evaluation
    :return:            list of path dictionaries (observations, actions, rewards, infos)
    """

    # get the correct env behavior
    if isinstance(env, str):
        if isinstance(env_kwargs, dict):
            env = GymEnv(env, **env_kwargs)
        else:
            env = GymEnv(env)
    elif isinstance(env, GymEnv):
        pass  # already a wrapped environment
    elif callable(env):
        # guard against env_kwargs=None for factory functions as well
        env = env(**env_kwargs) if isinstance(env_kwargs, dict) else env()
    else:
        raise AttributeError("Unsupported environment format")

    if base_seed is not None:
        env.set_seed(base_seed)
        np.random.seed(base_seed)
    else:
        np.random.seed()
    horizon = min(horizon, env.horizon)
    paths = []

    for ep in range(num_traj):
        # seeding
        if base_seed is not None:
            seed = base_seed + ep
            env.set_seed(seed)
            np.random.seed(seed)

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []

        o = env.reset()
        if init_states_per_cpu is not None:
            o = env.set_env_state(init_states_per_cpu[ep])
            assert o is not None, 'set_env_state of env ' + env.env_id + ' returns None, should return observation'
        done = False
        t = 0

        while t < horizon and not done:
            a, agent_info = policy.get_action(o)
            if eval_mode:
                a = agent_info['evaluation']
            env_info_base = env.get_env_infos()
            next_o, r, done, env_info_step = env.step(a)
            # below is important to ensure correct env_infos for the timestep
            env_info = env_info_step if env_info_base == {} else env_info_base
            observations.append(o)
            actions.append(a)
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            o = next_o
            t += 1

        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            terminated=done)
        paths.append(path)

    del env
    return paths
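
An illustrative call matching the docstring above (policy is a placeholder, the env id is hypothetical, and init_states is a placeholder list of environment states, e.g. collected via env.get_env_state(); passing init_states_per_cpu pins every episode to a fixed start state for reproducible evaluation):

eval_paths = do_rollout(num_traj=len(init_states), env='Hopper-v3', policy=policy,
                        eval_mode=True, horizon=500, base_seed=0,
                        init_states_per_cpu=init_states)
returns = [np.sum(p['rewards']) for p in eval_paths]
print("mean return: %.2f +/- %.2f" % (np.mean(returns), np.std(returns)))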
Example #4
def main(include, env_name, path_file, loop, render, save):
    # get env
    if include != "":
        # imported for side effects only (e.g., registering custom envs)
        exec("import " + include)
    env = GymEnv(env_name)

    # load paths
    paths = pickle.load(open(path_file, 'rb'))

    # playback paths
    pbk_paths = []
    for i_loop in range(loop):
        for path in paths:
            obs = []
            act = []
            rewards = []
            env_infos = []
            states = []

            # initialize paths
            if "state" in path.keys():
                env.reset(init_state=path["state"][0])
            else:
                env.reset()

            # playback input path
            if render:
                env.env.env.mujoco_render_frames = True
            o = env.get_obs()
            for i_step in range(path['actions'].shape[0]):
                a = path['actions'][i_step]
                s = env.get_env_state()
                onext, r, d, info = env.step(a)  # t = t+1
                if render:
                    env.render()
                obs.append(o)
                o = onext
                act.append(a)
                rewards.append(r)
                env_infos.append(info)
                states.append(env.get_env_state())
            if render:
                # turn frame rendering back off once playback of this path is done
                env.env.env.mujoco_render_frames = False

            # create output paths
            pbk_path = dict(
                observations=np.array(obs),
                actions=np.array(act),
                rewards=np.array(rewards),
                env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
                states=states)
            pbk_paths.append(pbk_path)
        print("Finished playback loop:{}".format(i_loop))

    # save output paths
    if save:
        pbk_file_name = path_file[:path_file.rfind('.')] + "_playback.pickle"
        pickle.dump(pbk_paths, open(pbk_file_name, 'wb'))
        print("Saved: " + pbk_file_name)

    return pbk_paths
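
A direct call might look like the sketch below (file names and the env id are hypothetical; in the source project this function may instead be wired to a command-line parser, and include="" skips the optional side-effect import):

pbk_paths = main(include="", env_name='Hopper-v3', path_file='demos.pickle',
                 loop=1, render=False, save=True)
# each returned path also carries the recorded simulator states under 'states'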
Example #5
def do_evaluation_rollout(N,
                          policy,
                          T=1e6,
                          env=None,
                          env_name=None,
                          pegasus_seed=None):
    """
    params:
    N               : number of trajectories
    policy          : policy to be used to sample the data
    T               : maximum length of trajectory
    env             : env object to sample from
    env_name        : name of env to be sampled from
                      (one of env or env_name must be specified)
    pegasus_seed    : seed for environment (numpy seed must be set externally)
    """

    if env_name is None and env is None:
        print("No environment specified! Error will be raised")
    if env is None: env = get_environment(env_name)
    if pegasus_seed is not None:
        try:
            env.env._seed(pegasus_seed)
        except AttributeError:
            env.env.seed(pegasus_seed)
    T = min(T, env.horizon)

    # print("####### Worker started #######")

    paths = []

    for ep in range(N):

        # Set pegasus seed if asked
        if pegasus_seed is not None:
            seed = pegasus_seed + ep
            try:
                env.env._seed(seed)
            except AttributeError:
                env.env.seed(seed)
            np.random.seed(seed)
        else:
            np.random.seed()

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []

        o = env.reset()
        done = False
        t = 0

        while t < T and not done:
            _, agent_info = policy.get_action(o)
            a = agent_info['evaluation']
            next_o, r, done, env_info = env.step(a)
            # observations.append(o.ravel())
            observations.append(o)
            actions.append(a)
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            o = next_o
            t += 1

        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            terminated=done)

        paths.append(path)

    # print("====== Worker finished ======")

    return paths
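
A sketch of how the returned paths are typically summarized (the policy and env objects are placeholders; only numpy is assumed):

import numpy as np

paths = do_evaluation_rollout(N=10, policy=policy, T=500, env=env, pegasus_seed=123)
returns = [np.sum(p['rewards']) for p in paths]
print("evaluation return: %.2f +/- %.2f over %d episodes"
      % (np.mean(returns), np.std(returns), len(returns)))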
Example #6
    def trajectory_generator(self, beta, dagger_ep):
        if self.env_name is None and self.env is None:
            print("No environment specified! Error will be raised")
        if self.env is None: self.env = get_environment(self.env_name)

        T = self.env.horizon

        paths = []
        print('Generating trajectories')
        for ep in tqdm(range(self.num_traj_gen)):
            if self.seed is not None:
                seed = self.seed + ep + dagger_ep * self.num_traj_gen
                self.env.env.env._seed(seed)
                np.random.seed(seed)
            else:
                np.random.seed()
            observations = []
            actions = []
            rewards = []
            agent_infos = []
            env_infos = []
            path_image_pixels = []
            all_robot_info = []

            robot_info = None
            if self.has_robot_info:
                # reset returns (obs, env_info) when robot_info is available
                o, env_info = self.env.reset()
                robot_info = env_info['robot_info']
            else:
                o = self.env.reset()
            done = False
            t = 0

            while t < T and not done:
                # random draw for the expert/learner mixing decision (DAgger-style beta)
                rand = np.random.random()
                image_pix = self.env.get_pixels(frame_size=FRAME_SIZE,
                                                camera_name=self.camera_name,
                                                device_id=self.device_id)

                a_expert, agent_info_expert = self.expert_policy.get_action(o)

                img = image_pix
                prev_img = image_pix
                prev_prev_img = image_pix
                if t > 0:
                    prev_img = path_image_pixels[t - 1]
                if t > 1:
                    prev_prev_img = path_image_pixels[t - 2]

                prev_prev_img = np.expand_dims(prev_prev_img, axis=0)
                prev_img = np.expand_dims(prev_img, axis=0)
                img = np.expand_dims(img, axis=0)

                o_img = np.concatenate((prev_prev_img, prev_img, img), axis=0)
                a_viz, agent_info_viz = self.viz_policy.get_action(
                    o_img,
                    use_seq=self.use_seq,
                    use_cuda=self.use_cuda,
                    robot_info=robot_info)

                if rand <= beta:
                    a = agent_info_expert['evaluation']
                    agent_info = agent_info_expert
                else:
                    a = a_viz
                    agent_info = agent_info_viz

                next_o, r, done, env_info = self.env.step(a)
                observations.append(o)
                # always store the expert's action as the label, even when the
                # learner's action was executed (standard DAgger data collection)
                actions.append(agent_info_expert['evaluation'])
                rewards.append(r)
                agent_infos.append(agent_info)
                env_infos.append(env_info)
                path_image_pixels.append(image_pix)
                if self.has_robot_info:
                    all_robot_info.append(robot_info)
                    robot_info = env_info['robot_info']

                o = next_o
                t += 1

            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
                agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
                env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
                terminated=done,
                image_pixels=np.array(path_image_pixels))
            if self.has_robot_info:
                path['robot_info'] = np.array(all_robot_info)

            paths.append(path)
        return paths
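
The beta argument mixes expert and learned-policy actions; a common DAgger-style schedule (an assumption, not taken from this snippet) decays it geometrically across dagger epochs. The agent object, the retraining step, and the hyperparameters below are placeholders:

beta0, decay, num_dagger_iters = 1.0, 0.8, 10
for dagger_ep in range(num_dagger_iters):
    beta = beta0 * (decay ** dagger_ep)  # expert is queried less often over time
    paths = agent.trajectory_generator(beta=beta, dagger_ep=dagger_ep)
    # aggregate (image observations, expert action labels) from paths
    # and retrain the visual policy on the aggregated dataset here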