Example #1
def sample_trajectory(env,
                      policy: BasePolicy,
                      max_path_length,
                      render=False,
                      render_mode=('rgb_array',)):
    # initialize env for the beginning of a new rollout
    ob = env.reset()  # HINT: should be the output of resetting the env

    # init vars
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:

        # render image of the simulated env
        if render:
            if 'rgb_array' in render_mode:
                if hasattr(env, 'sim'):
                    image_obs.append(
                        env.sim.render(camera_name='track',
                                       height=500,
                                       width=500)[::-1])
                else:
                    image_obs.append(env.render(mode=render_mode))
            if 'human' in render_mode:
                env.render(mode=render_mode)
                time.sleep(env.model.opt.timestep)

        # use the most recent ob to decide what to do
        obs.append(ob)

        ac = policy.get_action(ob)  # HINT: query the policy's get_action function
        ac = ac[0].cpu().detach().numpy()  # move to CPU and drop the batch dimension
        acs.append(ac)

        # take that action and record results
        ob, rew, done, _ = env.step(ac)  # _ = info

        # record result of taking that action
        steps += 1
        next_obs.append(ob)
        rewards.append(rew)

        # end the rollout if the rollout ended
        # HINT: rollout can end due to done, or due to max_path_length
        rollout_done = done or steps >= max_path_length  # True once either condition holds
        terminals.append(rollout_done)

        if rollout_done:
            break

    # next_obs[t] is the observation seen after taking acs[t] from obs[t]
    return Path(obs, image_obs, acs, rewards, next_obs, terminals)
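
For context, a sampler like this is normally driven by a batch-collection loop that keeps rolling out until a timestep budget is reached. The sketch below is illustrative only: sample_trajectories and min_timesteps_per_batch are hypothetical names, and it assumes the Path helper returns a dict with a "reward" entry holding one reward per step.

def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False):
    # keep sampling rollouts until the requested number of environment steps is collected
    paths, timesteps_this_batch = [], 0
    while timesteps_this_batch < min_timesteps_per_batch:
        path = sample_trajectory(env, policy, max_path_length, render)
        paths.append(path)
        timesteps_this_batch += len(path["reward"])  # one reward per environment step
    return paths, timesteps_this_batch
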
Example #2
def sample_trajectory(
    env,
    policy: BasePolicy,
    max_path_length: int,
    render: bool = False,
    render_mode=('rgb_array',),
) -> PathDict:

    # initialize env for the beginning of a new rollout
    ob: np.ndarray = env.reset()

    # init vars
    obs: List[np.ndarray] = []
    acs: List[np.ndarray] = []
    rewards: List[np.ndarray] = []
    next_obs: List[np.ndarray] = []
    terminals: List[bool] = []
    image_obs: List[np.ndarray] = []
    steps = 0
    while True:

        # render image of the simulated env
        if render:
            if 'rgb_array' in render_mode:
                if hasattr(env, 'sim'):
                    image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1])
                else:
                    image_obs.append(env.render(mode=render_mode))
            if 'human' in render_mode:
                env.render(mode=render_mode)
                time.sleep(env.model.opt.timestep)

        # use the most recent ob to decide what to do
        obs.append(ob)
        ac = policy.get_action(ob)
        ac = int(ac)  # discrete action space: cast the sampled action to a plain int
        acs.append(ac)

        # take that action and record results
        ob, rew, done, _ = env.step(ac)

        # record result of taking that action
        steps += 1
        next_obs.append(ob)
        rewards.append(rew)

        # end the rollout if the rollout ended
        # HINT: rollout can end due to done, or due to max_path_length
        rollout_done = bool(done) or steps >= max_path_length
        terminals.append(rollout_done)

        if rollout_done:
            break

    return Path(obs, image_obs, acs, rewards, next_obs, terminals)
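
Example #2 annotates its return value as PathDict. The definition is part of the course infrastructure and is not shown here; the sketch below is only one plausible shape for it, assuming the usual dict-of-arrays layout.

from typing import TypedDict
import numpy as np

class PathDict(TypedDict):
    # assumed keys, matching the arguments passed to Path(...) above
    observation: np.ndarray
    image_obs: np.ndarray
    action: np.ndarray
    reward: np.ndarray
    next_observation: np.ndarray
    terminal: np.ndarray
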
Example #3
    def do_relabel_with_expert(self, expert_policy: BasePolicy, paths):
        print(
            "\nRelabelling collected observations with labels from an expert policy..."
        )

        # TODO relabel collected observations (from our policy) with labels from an expert policy
        # HINT: query the policy (using the get_action function) with paths[i]["observation"]
        # and replace paths[i]["action"] with these expert labels
        for i in range(len(paths)):
            obs = paths[i]['observation']
            paths[i]['action'] = expert_policy.get_action(obs)

        return paths
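
Relabelling with the expert is the DAgger step; after it, the paths are typically flattened into (observation, expert action) pairs for a supervised update of the learner. A minimal sketch, assuming the usual Path key names:

import numpy as np

def paths_to_dataset(paths):
    # concatenate per-path arrays into one flat supervised-learning dataset
    observations = np.concatenate([path["observation"] for path in paths])
    actions = np.concatenate([path["action"] for path in paths])
    return observations, actions
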
Example #4
def sample_trajectory(
        env,
        policy: BasePolicy,
        max_path_length: int,
        render: bool = False,
        render_mode=('rgb_array',),
) -> PathDict:

    ob = env.reset()
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:
        if render:
            if 'rgb_array' in render_mode:
                if hasattr(env, 'sim'):
                    if 'track' in env.env.model.camera_names:
                        image_obs.append(
                            env.sim.render(camera_name='track',
                                           height=500,
                                           width=500)[::-1])
                    else:
                        image_obs.append(
                            env.sim.render(height=500, width=500)[::-1])
                else:
                    image_obs.append(env.render(mode=render_mode))
            if 'human' in render_mode:
                env.render(mode=render_mode)
                time.sleep(env.model.opt.timestep)
        obs.append(ob)
        ac = policy.get_action(ob)
        ac = ac[0]
        acs.append(ac)
        ob, rew, done, _ = env.step(ac)
        # add the observation after taking a step to next_obs
        next_obs.append(ob)
        rewards.append(rew)
        steps += 1
        # If the episode ended, the corresponding terminal value is 1
        # otherwise, it is 0
        if done or steps >= max_path_length:  # cap the rollout at max_path_length steps
            terminals.append(1)
            break
        else:
            terminals.append(0)

    return Path(obs, image_obs, acs, rewards, next_obs, terminals)
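
When render is enabled, the frames accumulated in image_obs are usually stacked into a single array for video logging. Illustrative usage only; env and policy are assumed to exist, and the "image_obs" key follows the Path layout used above.

import numpy as np

path = sample_trajectory(env, policy, max_path_length=200, render=True)
frames = path["image_obs"]
video = np.stack(frames, axis=0) if len(frames) > 0 else None  # shape (T, H, W, C)
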
Example #5
    def do_relabel_with_expert(self, expert_policy: BasePolicy,
                               paths: List[PathDict]) -> List[PathDict]:
        print(
            "\nRelabelling collected observations with labels from an expert policy..."
        )

        # TODO relabel collected observations (from our policy) with labels from an expert policy
        # HINT: query the policy (using the get_action function) with paths[i]["observation"]
        # and replace paths[i]["action"] with these expert labels
        relabeled_paths: List[PathDict] = []
        for path in paths:
            relabeled_path = copy.deepcopy(path)
            for t, observation in enumerate(path['observation']):
                # write the expert label into the copy so the original path is left untouched
                relabeled_path['action'][t] = expert_policy.get_action(observation)
            relabeled_paths.append(relabeled_path)

        return relabeled_paths
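
Because the loop above queries the expert one observation at a time, an equivalent and usually faster variant issues a single batched get_action call per path, as Example #3 does. A hedged sketch of that non-mutating version, assuming get_action accepts a batch of observations:

import copy

def relabel_batched(expert_policy, paths):
    relabeled_paths = []
    for path in paths:
        relabeled_path = copy.deepcopy(path)
        # one batched expert query replaces the per-timestep loop
        relabeled_path["action"] = expert_policy.get_action(path["observation"])
        relabeled_paths.append(relabeled_path)
    return relabeled_paths
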
Example #6
    def collect_loss(
            self,
            actions,
            labels,
            criterion,
            expert_policy: BasePolicy,
    ):
        print("Start collecting path and expert actions...")
        total_envsteps = 0
        emp_action = []
        done = False
        reward = 0
        loss = 0
        for _ in range(self.params['batch_size']):
            if total_envsteps == 0:
                obs = self.env.reset()
            else:
                action_to_step = map_utils.restore_continuous_action(emp_action, self.params['bins'], -5, 5)
                obs, reward, done, info = self.env.step(action_to_step)

            obs_tensor = torch.tensor(obs, device=ptu.device, dtype=torch.float64)
            emp_action = self.gm_net(torch.flatten(obs_tensor).float())
            expert_labels = np.squeeze(expert_policy.get_action(obs))
            discretized_labels = torch.tensor(
                map_utils.discretize_action(expert_labels, 3, self.params['bins'], -5, 5),
                device=ptu.device, dtype=torch.float64)

            # print debug logs
            # print("step's reward is ", reward)
            # print("obs is", obs)
            print("expert_labels is", expert_labels)
            # print("action is", map_utils.restore_continuous_action(emp_action, self.params['bins'], -1, 1))
            # print("discrete labels is", discretized_labels)
            print("emp_action is", map_utils.restore_continuous_action(emp_action, self.params['bins'], -5, 5))
            # print("discrete action is", emp_action)

            loss += criterion(emp_action, discretized_labels)

            if done:
                print("Ending current training iteration")
                break
            else:
                total_envsteps += 1

        return loss
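
collect_loss returns a criterion value accumulated over the sampled steps, so a caller is expected to back-propagate it through gm_net. The sketch below is illustrative only: trainer, expert_policy, the Adam hyperparameters, and the MSE criterion are all assumptions, not part of the example.

import torch

optimizer = torch.optim.Adam(trainer.gm_net.parameters(), lr=1e-3)  # hypothetical setup

loss = trainer.collect_loss(actions=None, labels=None,
                            criterion=torch.nn.MSELoss(),
                            expert_policy=expert_policy)
optimizer.zero_grad()
loss.backward()
optimizer.step()
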
Example #7
    def collect_trajactory(
        self,
        dataset,
        expert_policy: BasePolicy,
    ):
        print("Start collecting path and expert actions...")
        total_envsteps = 0
        emp_action = None
        done = False
        rewards = []
        reward = 0
        for _ in range(self.params['batch_size']):
            if total_envsteps == 0:
                obs = self.env.reset()
            else:
                action_to_step = map_utils.restore_continuous_action_from_binary(
                    emp_action, self.params['bins'], -4, 4)
                obs, reward, done, info = self.env.step(action_to_step)

            obs_tensor = torch.tensor(obs,
                                      device=ptu.device,
                                      dtype=torch.float64)
            emp_action = self.gm_net(torch.flatten(obs_tensor).float())
            expert_labels = np.squeeze(expert_policy.get_action(obs))
            dataset["expert_action"].append(expert_labels)
            dataset["observation"].append(obs)
            rewards.append(reward)

            # print("expert action", expert_labels)
            # print("emp action", map_utils.restore_continuous_action_from_binary(emp_action, self.params['bins'], -4, 4))
            # print("step's reward is ", reward)

            if done:
                print("Ending current training iteration")
                break
            else:
                total_envsteps += 1

        print("obs", len(dataset["observation"]))
        print("expert_action", len(dataset["expert_action"]))
        return dataset, rewards
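
The returned dataset pairs raw observations with expert actions, so a DAgger-style outer loop would typically fit gm_net to it between collection rounds. A minimal sketch under stated assumptions: trainer and expert_policy are hypothetical objects, gm_net is assumed to accept a batch of flat observations, and the MSE loss is illustrative (the pipeline above actually discretizes actions).

import numpy as np
import torch

dataset = {"observation": [], "expert_action": []}
dataset, rewards = trainer.collect_trajactory(dataset, expert_policy)

obs_batch = torch.as_tensor(np.array(dataset["observation"]), dtype=torch.float32)
act_batch = torch.as_tensor(np.array(dataset["expert_action"]), dtype=torch.float32)

pred = trainer.gm_net(obs_batch)                       # assumed to handle batched input
loss = torch.nn.functional.mse_loss(pred, act_batch)   # illustrative supervised objective
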