def _make_network(self, environment):
    # Image observations (3-D shape) get a CNN; everything else gets an MLP.
    if len(environment.observation_space.shape) == 3:
        return datastructures.CNNModule(environment).to(
            DefaultDevice.current())
    else:
        return datastructures.ObservationMLPModule(environment).to(
            DefaultDevice.current())
def create_empty(self, environment, policy):
    if len(environment.observation_space.shape) == 3:
        return datastructures.CNNModule(environment).to(
            DefaultDevice.current())
    else:
        return datastructures.ObservationMLPModule(environment).to(
            DefaultDevice.current())
def __init__(self, buffer_size, feature_size, num_neighbors):
    # Preallocated storage for the stored feature vectors and for the
    # nearest-neighbor results.
    self.buffer = torch.zeros(buffer_size, feature_size,
                              device=DefaultDevice.current())
    self.nearest_neighbors = torch.zeros(num_neighbors, feature_size,
                                         device=DefaultDevice.current())

    self.buffer_size = buffer_size
    self.feature_size = feature_size
    self.num_neighbors = num_neighbors

    # Number of points stored so far and the current write position.
    self.num_points = 0
    self.buffer_pos = 0
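
The constructor above only allocates storage. A minimal sketch of how such a ring buffer might be filled and queried follows; the add and nearest method names (and their exact semantics) are assumptions for illustration, not part of the original class.

def add(self, features):
    # Write a batch of feature vectors [B, feature_size] into the ring
    # buffer, wrapping the write position once the buffer is full.
    for row in features:
        self.buffer[self.buffer_pos] = row
        self.buffer_pos = (self.buffer_pos + 1) % self.buffer_size
        self.num_points = min(self.num_points + 1, self.buffer_size)

def nearest(self, query):
    # Return the stored features closest (L2 distance) to a single query
    # vector, up to num_neighbors of them.
    valid = self.buffer[:self.num_points]
    dists = torch.norm(valid - query.unsqueeze(0), dim=1)
    k = min(self.num_neighbors, self.num_points)
    _, idx = torch.topk(dists, k, largest=False)
    self.nearest_neighbors[:k] = valid[idx]
    return self.nearest_neighbors[:k]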
Example #4
    def _normalize_internal_reward(self, r):
        if TspParams.current().NORMALIZE_INTERNAL_REWARD == "ALL":
            for k in r.detach():
                self.internal_reward_normalizer_all.update(k)

            std = self.internal_reward_normalizer_all.std
            if std == 0:
                return torch.sign(r)
            else:
                return r / std
        elif TspParams.current().NORMALIZE_INTERNAL_REWARD == "100":
            REWARD_WINDOW = 5 * 100
            for k in r.detach():
                self.internal_reward_normalizer_window.append(k.item())
            self.internal_reward_normalizer_window = self.internal_reward_normalizer_window[
                -REWARD_WINDOW:]
            std = torch.tensor(self.internal_reward_normalizer_window,
                               device=DefaultDevice.current()).std()
            # print(r, std, r/std)
            # print(r[0], std, r[0]/std)
            if torch.isnan(std):
                return torch.sign(r)
            else:
                return r / std
        else:
            return r.detach()
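
The "ALL" branch relies on a running-statistics helper exposing update() and .std. A minimal sketch of such a helper, based on Welford's online algorithm, is given below; the class name and details are assumptions, and the project's actual normalizer may differ.

class RunningStd:
    # Welford-style online standard deviation: update() folds in one scalar
    # sample, .std reports the current sample standard deviation.
    def __init__(self):
        self.count = 0
        self.mean = 0.0
        self.m2 = 0.0

    def update(self, x):
        x = float(x)
        self.count += 1
        delta = x - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (x - self.mean)

    @property
    def std(self):
        if self.count < 2:
            return 0.0
        return (self.m2 / (self.count - 1)) ** 0.5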
Example #5
    def _normalize_combined_reward(self, combined_reward, dones):
        combined_reward = combined_reward.cpu().numpy()
        all_rews = []
        for timestep_i in range(
                TspParams.current().STEPS_PER_CURIOSITY_UPDATE):
            step_combined_reward = combined_reward[
                timestep_i *
                TspParams.current().NUM_ROLLOUTS_PER_TRIAL:(timestep_i + 1) *
                TspParams.current().NUM_ROLLOUTS_PER_TRIAL]

            self.ret = self.ret * self.gamma + step_combined_reward

            self.ret_rms.update(self.ret)
            rews = np.clip(
                step_combined_reward /
                np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew,
                self.cliprew)

            timestep_dones = dones[timestep_i *
                                   TspParams.current().NUM_ROLLOUTS_PER_TRIAL:
                                   (timestep_i + 1) *
                                   TspParams.current().NUM_ROLLOUTS_PER_TRIAL]
            # print(len(timestep_dones), timestep_dones, len(dones))
            # print(self.ret.shape)
            self.ret[timestep_dones] = 0.
            all_rews.append(torch.tensor(rews, device=DefaultDevice.current()))

        return torch.cat(all_rews)
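
Return normalization here assumes a self.ret_rms object with update() and .var, the pattern popularized by OpenAI Baselines' VecNormalize wrapper. A sketch of such a running mean/variance tracker is shown below under that assumption; the project's own class may differ.

import numpy as np

class RunningMeanStd:
    # Tracks a running mean and variance over batches using the parallel
    # (Chan et al.) variance-combination formula.
    def __init__(self, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, batch):
        batch_mean = np.mean(batch, axis=0)
        batch_var = np.var(batch, axis=0)
        batch_count = batch.shape[0]

        delta = batch_mean - self.mean
        total = self.count + batch_count

        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total

        self.mean, self.var, self.count = new_mean, m2 / total, total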
Example #6
    def extract_from_rollout_buffer(self, rollouts, cur_start_timestep):
        assert TspParams.current().REAL_BATCH_REWARD_COMPUTATION
        states = []
        prev_states = []
        actions = []
        extrinsic_rewards = []
        normalized_timesteps = []
        dones = []
        for timestep_i in range(TspParams.current().PPO_FRAMES_PER_PROC):
            for rollout in range(TspParams.current().NUM_ROLLOUTS_PER_TRIAL):
                i_episode = cur_start_timestep + timestep_i

                states.append(rollouts.obs[timestep_i + 1][rollout])
                prev_states.append(rollouts.obs[timestep_i][rollout])
                actions.append(rollouts.actions[timestep_i][rollout])
                extrinsic_rewards.append(
                    rollouts.rewards[timestep_i][rollout].detach())
                dones.append(not rollouts.masks[timestep_i][rollout])
                normalized_timesteps.append(
                    i_episode / TspParams.current().STEPS_PER_ROLLOUT)

        states_tensor = torch.stack(states)
        prev_states_tensor = torch.stack(prev_states)
        actions_tensor = self.remap_actions(self.envs, actions)
        extrinsic_rewards_tensor = torch.cat(extrinsic_rewards)
        normalized_timesteps_tensor = torch.tensor(
            normalized_timesteps,
            dtype=torch.float,
            device=DefaultDevice.current())

        return states_tensor, prev_states_tensor, actions_tensor, extrinsic_rewards_tensor, normalized_timesteps_tensor, dones
def create_empty(self, environment, policy):
    if TspParams.current().REAL_BATCH_REWARD_COMPUTATION:
        # Note: both branches currently compute the same size.
        if TspParams.current().SHARE_CURIOSITY_MODULE_IN_TRIAL:
            size = TspParams.current().STEPS_PER_CURIOSITY_UPDATE
        else:
            size = TspParams.current().STEPS_PER_CURIOSITY_UPDATE
    else:
        size = 1
    return torch.ones(size,
                      device=DefaultDevice.current()) * self.constant_value
def one_hot(labels, num_classes):
    """Embedding labels to one-hot form.
    Source: https://discuss.pytorch.org/t/convert-int-into-one-hot-format/507/26

    Args:
      labels: (LongTensor) class labels, sized [N,].
      num_classes: (int) number of classes.

    Returns:
      (tensor) encoded labels, sized [N, #classes].
    """
    y = torch.eye(num_classes, device=DefaultDevice.current())
    return y[labels]
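
A short usage example, assuming DefaultDevice.current() resolves to the CPU here for illustration:

labels = torch.tensor([0, 2, 1])
one_hot(labels, num_classes=3)
# tensor([[1., 0., 0.],
#         [0., 0., 1.],
#         [0., 1., 0.]])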
Example #9
    def _internal_reward(self,
                         state,
                         action,
                         next_state,
                         profiler=None,
                         i_episode=None):

        if state is None:
            return torch.zeros(action.shape[0],
                               dtype=torch.float,
                               device=DefaultDevice.current())

        input_values = {
            "observation_image": state,
            "action_one_hot": action,
            "new_observation_image": next_state,
        }

        assert set(input_values.keys()) == set(
            i.name for i in self.curiosity_program.input_variables), (
                "available values", set(input_values.keys()),
                "requested values",
                set(i.name for i in self.curiosity_program.input_variables))

        input_values_by_variable = {
            i: input_values[i.name]
            for i in self.curiosity_program.input_variables
        }

        r = self.curiosity_program.execute(
            input_values_by_variable,
            self.curiosity_data_structure_values,
            self.curiosity_optimizer_values,
            profiler=profiler,
            print_on_error=False,
            i_episode=i_episode).detach()

        return self._normalize_internal_reward(r)
Example #10
def remap_actions(self, envs, actions):
    if not TspParams.current().CONTINUOUS_ACTION_SPACE:
        # Discrete actions become one-hot vectors sized by the action space.
        return one_hot(torch.cat(actions), envs.action_space.n).to(
            DefaultDevice.current())
    else:
        return torch.stack(actions).to(DefaultDevice.current())
def std(self):
    stds = np.array([w.std for w in self.item_welfords])
    return torch.tensor(stds, device=DefaultDevice.current())

def mean(self):
    means = np.array([w.mean for w in self.item_welfords])
    return torch.tensor(means, device=DefaultDevice.current())
def create_empty(self, environment, policy):
    return datastructures.Ensemble([
        MLP(32 + get_action_space_size(environment.action_space), 32,
            [32, 32]) for i in range(self.NUM_MODELS)
    ], environment).to(DefaultDevice.current())

def create_empty(self, environment, policy):
    return datastructures.Ensemble(
        [self._make_network(environment) for i in range(self.NUM_MODELS)],
        environment).to(DefaultDevice.current())

def create_empty(self, environment, policy):
    return MLP(32, 32, [32, 32]).to(DefaultDevice.current())

def create_empty(self, environment, policy):
    return MLP(get_action_space_size(environment.action_space), 32,
               [32, 32]).to(DefaultDevice.current())
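
datastructures.Ensemble itself is not shown in this listing. A minimal illustrative stand-in with the interface these factories assume (a list of member networks plus the environment) might look like the following; the name SimpleEnsemble and the stacking behaviour are assumptions, not the project's actual implementation.

import torch
import torch.nn as nn

class SimpleEnsemble(nn.Module):
    # Illustrative stand-in: holds N member networks and stacks their
    # outputs along a new leading dimension.
    def __init__(self, members, environment=None):
        super().__init__()
        self.members = nn.ModuleList(members)
        self.environment = environment

    def forward(self, x):
        return torch.stack([m(x) for m in self.members], dim=0)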