def _make_network(self, environment):
    if len(environment.observation_space.shape) == 3:
        return datastructures.CNNModule(environment).to(
            DefaultDevice.current())
    else:
        return datastructures.ObservationMLPModule(environment).to(
            DefaultDevice.current())
def create_empty(self, environment, policy):
    if len(environment.observation_space.shape) == 3:
        return datastructures.CNNModule(environment).to(
            DefaultDevice.current())
    else:
        return datastructures.ObservationMLPModule(environment).to(
            DefaultDevice.current())
def __init__(self, buffer_size, feature_size, num_neighbors):
    self.buffer = torch.zeros(buffer_size, feature_size,
                              device=DefaultDevice.current())
    self.nearest_neighbors = torch.zeros(num_neighbors, feature_size,
                                         device=DefaultDevice.current())
    self.buffer_size = buffer_size
    self.feature_size = feature_size
    self.num_neighbors = num_neighbors
    self.num_points = 0
    self.buffer_pos = 0
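
# Illustrative sketch (not from the original code): one plausible way a fixed-size
# feature buffer like the one initialized above could be filled ring-buffer style
# and queried for nearest neighbors. The helper names `buffer_add` and `buffer_knn`
# and the use of torch.cdist/topk are assumptions for illustration only.
import torch


def buffer_add(buffer, buffer_pos, num_points, features):
    # Write a batch of feature vectors into the buffer, overwriting oldest entries.
    buffer_size = buffer.shape[0]
    for f in features:
        buffer[buffer_pos] = f
        buffer_pos = (buffer_pos + 1) % buffer_size
        num_points = min(num_points + 1, buffer_size)
    return buffer_pos, num_points


def buffer_knn(buffer, num_points, query, k):
    # Return the k stored features closest (L2 distance) to each query vector.
    valid = buffer[:num_points]                        # only the filled slots
    dists = torch.cdist(query, valid)                  # [num_queries, num_points]
    _, idx = dists.topk(min(k, num_points), dim=1, largest=False)
    return valid[idx]                                  # [num_queries, k, feature_size]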
def _normalize_internal_reward(self, r):
    if TspParams.current().NORMALIZE_INTERNAL_REWARD == "ALL":
        # Normalize by a running std computed over every reward seen so far.
        for k in r.detach():
            self.internal_reward_normalizer_all.update(k)
        std = self.internal_reward_normalizer_all.std
        if std == 0:
            return torch.sign(r)
        else:
            return r / std
    elif TspParams.current().NORMALIZE_INTERNAL_REWARD == "100":
        # Normalize by the std of a sliding window of recent rewards.
        REWARD_WINDOW = 5 * 100
        for k in r.detach():
            self.internal_reward_normalizer_window.append(k.item())
        self.internal_reward_normalizer_window = \
            self.internal_reward_normalizer_window[-REWARD_WINDOW:]
        std = torch.tensor(self.internal_reward_normalizer_window,
                           device=DefaultDevice.current()).std()
        if torch.isnan(std):
            return torch.sign(r)
        else:
            return r / std
    else:
        return r.detach()
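
# Illustrative sketch (assumption): the "100" branch above normalizes rewards by the
# std of a sliding window of recent values. A minimal standalone version of that
# idea, using a plain Python list `window` (hypothetical name), could look like this.
import torch


def normalize_by_recent_std(r, window, window_size=500):
    # Append the new rewards, keep only the most recent `window_size` entries,
    # and divide by their std; fall back to sign(r) when the std is undefined.
    window.extend(float(k) for k in r.detach())
    del window[:-window_size]
    std = torch.tensor(window).std()
    if torch.isnan(std) or std == 0:
        return torch.sign(r)
    return r / std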
def _normalize_combined_reward(self, combined_reward, dones):
    combined_reward = combined_reward.cpu().numpy()
    all_rews = []
    for timestep_i in range(TspParams.current().STEPS_PER_CURIOSITY_UPDATE):
        # Rewards for this timestep across all parallel rollouts.
        step_combined_reward = combined_reward[
            timestep_i * TspParams.current().NUM_ROLLOUTS_PER_TRIAL:
            (timestep_i + 1) * TspParams.current().NUM_ROLLOUTS_PER_TRIAL]
        # Update the running discounted return, normalize by its std, and clip.
        self.ret = self.ret * self.gamma + step_combined_reward
        self.ret_rms.update(self.ret)
        rews = np.clip(
            step_combined_reward / np.sqrt(self.ret_rms.var + self.epsilon),
            -self.cliprew, self.cliprew)
        timestep_dones = dones[
            timestep_i * TspParams.current().NUM_ROLLOUTS_PER_TRIAL:
            (timestep_i + 1) * TspParams.current().NUM_ROLLOUTS_PER_TRIAL]
        # Reset the running return for rollouts that just finished an episode.
        self.ret[timestep_dones] = 0.
        all_rews.append(torch.tensor(rews, device=DefaultDevice.current()))
    return torch.cat(all_rews)
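
# Illustrative sketch (assumption): the method above follows the common pattern of
# normalizing rewards by the std of a running discounted return (as popularized by
# baselines-style VecNormalize). A minimal standalone version with a hypothetical
# RunningStd helper could look like this.
import numpy as np


class RunningStd:
    # Running mean/variance over batches, via the parallel-variance update rule.
    def __init__(self):
        self.mean, self.var, self.count = 0.0, 1.0, 1e-4

    def update(self, x):
        batch_mean, batch_var, batch_count = x.mean(), x.var(), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        m2 = (self.var * self.count + batch_var * batch_count
              + delta ** 2 * self.count * batch_count / total)
        self.mean += delta * batch_count / total
        self.var, self.count = m2 / total, total


def normalize_step_rewards(rews, dones, ret, ret_rms, gamma=0.99, clip=10.0, eps=1e-8):
    # `ret` carries the per-rollout discounted return across calls.
    ret = ret * gamma + rews
    ret_rms.update(ret)
    normed = np.clip(rews / np.sqrt(ret_rms.var + eps), -clip, clip)
    ret[dones] = 0.0  # reset returns for rollouts that finished an episode
    return normed, ret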
def extract_from_rollout_buffer(self, rollouts, cur_start_timestep):
    assert TspParams.current().REAL_BATCH_REWARD_COMPUTATION
    states = []
    prev_states = []
    actions = []
    extrinsic_rewards = []
    normalized_timesteps = []
    dones = []
    for timestep_i in range(TspParams.current().PPO_FRAMES_PER_PROC):
        for rollout in range(TspParams.current().NUM_ROLLOUTS_PER_TRIAL):
            i_episode = cur_start_timestep + timestep_i
            states.append(rollouts.obs[timestep_i + 1][rollout])
            prev_states.append(rollouts.obs[timestep_i][rollout])
            actions.append(rollouts.actions[timestep_i][rollout])
            extrinsic_rewards.append(
                rollouts.rewards[timestep_i][rollout].detach())
            dones.append(not rollouts.masks[timestep_i][rollout])
            normalized_timesteps.append(
                i_episode / TspParams.current().STEPS_PER_ROLLOUT)
    states_tensor = torch.stack(states)
    prev_states_tensor = torch.stack(prev_states)
    actions_tensor = self.remap_actions(self.envs, actions)
    extrinsic_rewards_tensor = torch.cat(extrinsic_rewards)
    normalized_timesteps_tensor = torch.tensor(
        normalized_timesteps, dtype=torch.float,
        device=DefaultDevice.current())
    return (states_tensor, prev_states_tensor, actions_tensor,
            extrinsic_rewards_tensor, normalized_timesteps_tensor, dones)
def create_empty(self, environment, policy):
    if TspParams.current().REAL_BATCH_REWARD_COMPUTATION:
        # Both the shared and per-rollout curiosity-module configurations
        # currently use one entry per curiosity update step.
        size = TspParams.current().STEPS_PER_CURIOSITY_UPDATE
    else:
        size = 1
    return torch.ones(size,
                      device=DefaultDevice.current()) * self.constant_value
def one_hot(labels, num_classes):
    """Embed integer class labels as one-hot vectors.

    Source: https://discuss.pytorch.org/t/convert-int-into-one-hot-format/507/26

    Args:
        labels: (LongTensor) class labels, sized [N,].
        num_classes: (int) number of classes.

    Returns:
        (tensor) encoded labels, sized [N, num_classes].
    """
    y = torch.eye(num_classes, device=DefaultDevice.current())
    return y[labels]
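
# Usage example (illustrative): one_hot indexes rows of an identity matrix, so a
# batch of integer labels maps directly to one-hot vectors.
# >>> one_hot(torch.tensor([0, 2, 1]), num_classes=3)
# tensor([[1., 0., 0.],
#         [0., 0., 1.],
#         [0., 1., 0.]])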
def _internal_reward(self, state, action, next_state, profiler=None,
                     i_episode=None):
    if state is None:
        return torch.zeros(action.shape[0], dtype=torch.float,
                           device=DefaultDevice.current())
    input_values = {
        "observation_image": state,
        "action_one_hot": action,
        "new_observation_image": next_state,
    }
    assert set(input_values.keys()) == set(
        i.name for i in self.curiosity_program.input_variables), (
            "available values", set(input_values.keys()),
            "requested values",
            set(i.name for i in self.curiosity_program.input_variables))
    input_values_by_variable = {
        i: input_values[i.name]
        for i in self.curiosity_program.input_variables
    }
    r = self.curiosity_program.execute(
        input_values_by_variable,
        self.curiosity_data_structure_values,
        self.curiosity_optimizer_values,
        profiler=profiler,
        print_on_error=False,
        i_episode=i_episode).detach()
    return self._normalize_internal_reward(r)
def remap_actions(self, envs, actions):
    if not TspParams.current().CONTINUOUS_ACTION_SPACE:
        return one_hot(torch.cat(actions), self.envs.action_space.n).to(
            DefaultDevice.current())
    else:
        return torch.stack(actions).to(DefaultDevice.current())
def std(self):
    stds = np.array([w.std for w in self.item_welfords])
    return torch.tensor(stds, device=DefaultDevice.current())
def mean(self):
    means = np.array([w.mean for w in self.item_welfords])
    return torch.tensor(means, device=DefaultDevice.current())
def create_empty(self, environment, policy):
    return datastructures.Ensemble([
        MLP(32 + get_action_space_size(environment.action_space), 32,
            [32, 32]) for i in range(self.NUM_MODELS)
    ], environment).to(DefaultDevice.current())
def create_empty(self, environment, policy):
    return datastructures.Ensemble(
        [self._make_network(environment) for i in range(self.NUM_MODELS)],
        environment).to(DefaultDevice.current())
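
# Illustrative sketch (assumption): ensembles like the ones built above are commonly
# used to turn prediction disagreement into an intrinsic reward. A minimal standalone
# version with plain torch modules (hypothetical names) could look like this.
import torch
import torch.nn as nn


def make_ensemble(in_size, out_size, num_models, hidden=32):
    # Independent small MLPs whose predictions will be compared against each other.
    return nn.ModuleList([
        nn.Sequential(nn.Linear(in_size, hidden), nn.ReLU(),
                      nn.Linear(hidden, out_size))
        for _ in range(num_models)
    ])


def disagreement_reward(ensemble, x):
    # Variance of member predictions, averaged over output dims: high disagreement
    # on an input suggests the input is novel.
    preds = torch.stack([m(x) for m in ensemble])  # [num_models, batch, out_size]
    return preds.var(dim=0).mean(dim=-1)           # [batch]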
def create_empty(self, environment, policy):
    return MLP(32, 32, [32, 32]).to(DefaultDevice.current())
def create_empty(self, environment, policy):
    return MLP(get_action_space_size(environment.action_space), 32,
               [32, 32]).to(DefaultDevice.current())