def separate_out_data_types(self, experiences): """Puts the sampled experience into the correct format for a PyTorch neural network""" states = [ e.value[self.indexes_in_node_value_tuple["state"]] for e in experiences if e is not None ] states = unwrap_state(states, device=self.device) actions = torch.from_numpy( np.vstack([ e.value[self.indexes_in_node_value_tuple["action"]] for e in experiences if e is not None ])).long().to(self.device) rewards = torch.from_numpy( np.vstack([ e.value[self.indexes_in_node_value_tuple["reward"]] for e in experiences if e is not None ])).float().to(self.device) next_states = [ e.value[self.indexes_in_node_value_tuple["next_state"]] for e in experiences if e is not None ] next_states = unwrap_state(next_states, device=self.device) dones = torch.from_numpy( np.vstack([ int(e.value[self.indexes_in_node_value_tuple["done"]]) for e in experiences if e is not None ])).float().to(self.device) return states, actions, rewards, next_states, dones
def pick_action(self, state=None, eval_ep=False, top_k=None): if top_k is None: top_k = self.k if state is None: state = self.state if self.global_step_number < self.hyperparameters[ "min_steps_before_learning"] and not eval_ep: return self.sample_from_action_space(num_items=self.metrics_k) with torch.no_grad(): if not eval_ep: state = unwrap_state(state, device=self.device) state, targets = self.create_state_vector(state) self.eval() def action_fn(): return self.state_to_action(state, eval_ep, top_k) action = self.exploration_strategy.perturb_action_for_exploration_purposes( { "action_fn": action_fn, "turn_off_exploration": eval_ep, "episode_number": self.episode_number, "sample_shape": top_k }) self.to_train() if isinstance(action, torch.Tensor): action = action.cpu().detach().numpy() return action
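# Illustrative sketch only (not part of the original code): one plausible
# epsilon-greedy implementation of the exploration-strategy interface that
# pick_action above calls. The dict keys ("action_fn", "turn_off_exploration",
# "episode_number", "sample_shape") are taken from the call site; the decay
# schedule and num_actions attribute are assumptions, and the actual strategy
# in this codebase may differ.
import numpy as np


class EpsilonGreedyExplorationSketch:

    def __init__(self, num_actions, epsilon=0.1, decay=0.999):
        self.num_actions = num_actions  # assumed size of the item/action space
        self.epsilon = epsilon
        self.decay = decay

    def perturb_action_for_exploration_purposes(self, info):
        if info["turn_off_exploration"]:
            return info["action_fn"]()  # greedy actions during evaluation
        eps = self.epsilon * self.decay ** info["episode_number"]
        if np.random.rand() < eps:
            # replace the policy's slate with a random top-k sample
            return np.random.choice(self.num_actions,
                                    size=info["sample_shape"],
                                    replace=False)
        return info["action_fn"]()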
def pick_action_and_get_log_probabilities(self, state=None):
    """Picks actions and then calculates the log probabilities of the actions it picked given the policy"""
    if state is None:
        state = self.state
    state = unwrap_state(state, device=self.device)
    state, current_targets = self.create_state_vector(state)
    action_logits, hidden_state = self.policy(state)
    beta_logits = self.beta(hidden_state.detach(), current_targets)
    with torch.no_grad():
        beta_probs = beta_logits.softmax(dim=-1)
        # add PROB_MIN because probabilities must not be exactly 0 and
        # large logits lead to vanishingly small softmax values
        beta_samples = torch.multinomial(beta_probs + PROB_MIN, self.k)
        beta_prob = beta_probs.gather(1, beta_samples)
    ppo_weight = None
    action_log_prob = action_logits.log_softmax(dim=-1)
    if self.use_ppo:
        with torch.no_grad():
            curr_samples = torch.multinomial(
                action_log_prob.exp() + PROB_MIN, self.k)
            curr_prob = action_log_prob.gather(1, curr_samples)
            action_logits_last, _ = self.last_policy(state)
            action_prob_last = action_logits_last.softmax(dim=-1).gather(
                1, curr_samples)
            ppo_weight = curr_prob.exp() / (action_prob_last + 1e-8)
    action_prob = action_log_prob.gather(1, beta_samples)
    correction = torch.clamp_max_(
        torch.exp(action_prob) / beta_prob, CLIPPING_VALUE).detach()
    return (beta_samples.cpu().detach().numpy(), action_prob, correction,
            ppo_weight)
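# Illustrative sketch only: how the clipped off-policy correction computed in
# pick_action_and_get_log_probabilities behaves on toy values. CLIPPING_VALUE
# is assumed to be 1.0 here purely for the example.
import torch


def clipped_is_correction(action_log_prob, beta_prob, clipping_value=1.0):
    # pi(a|s) / beta(a|s), clipped from above so a single sampled action
    # cannot dominate the policy gradient
    return torch.clamp_max(action_log_prob.exp() / beta_prob,
                           clipping_value).detach()


action_log_prob = torch.log(torch.tensor([[0.20, 0.05]]))  # target policy pi
beta_prob = torch.tensor([[0.10, 0.50]])                   # behavior policy beta
print(clipped_is_correction(action_log_prob, beta_prob))   # tensor([[1.0000, 0.1000]])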
def separate_out_data_types(self, experiences):
    """Puts the sampled experience into the correct format for a PyTorch neural network"""
    states = [e.state for e in experiences if e is not None]
    states = unwrap_state(states, device=self.device)
    actions = torch.from_numpy(
        np.vstack([e.action for e in experiences
                   if e is not None])).long().to(self.device)
    rewards = torch.from_numpy(
        np.vstack([e.reward for e in experiences
                   if e is not None])).float().to(self.device)
    next_states = [e.next_state for e in experiences if e is not None]
    next_states = unwrap_state(next_states, device=self.device)
    dones = torch.from_numpy(
        np.vstack([int(e.done) for e in experiences
                   if e is not None])).float().to(self.device)
    return states, actions, rewards, next_states, dones
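# Illustrative sketch only: the replay-buffer record implied by the field
# accesses in separate_out_data_types above (e.state, e.action, e.reward,
# e.next_state, e.done). The actual buffer implementation in this codebase
# may store experiences differently; the field values below are placeholders.
from collections import namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

# example record as it could be stored after one environment step; state and
# next_state hold whatever unwrap_state consumes
example = Experience(state=[3, 7, 9],
                     action=[42],
                     reward=1.0,
                     next_state=[7, 9, 42],
                     done=False)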
def pick_action(self, state=None, eval_ep=False, top_k=None):
    if top_k is None:
        top_k = self.k
    if state is None:
        state = self.state
    if self.global_step_number < self.hyperparameters[
            "min_steps_before_learning"] and not eval_ep:
        return self.sample_from_action_space(num_items=self.metrics_k)
    with torch.no_grad():
        if not eval_ep:
            state = unwrap_state(state, device=self.device)
        state, targets = self.create_state_vector(state)
        action = self.state_to_action(state, eval_ep, top_k)
    return action
def pick_action_and_log_probs(self, state=None):
    if state is None:
        state = self.state
    state = unwrap_state(state, device=self.device)
    state, targets = self.create_state_vector(state)
    if self.hyperparameters["batch_rl"]:
        # Get the log probs for the target policy
        actions, action_log_probs = self.agent.log_probs_for_actions(
            state, targets)
        # Update the off-policy network for IS weights (behavior policy approximation)
        beta_logits = self.off_policy_agent(state.detach())
        beta_log_probs = beta_logits.log_softmax(dim=-1)
        beta_log_probs = beta_log_probs[torch.arange(beta_log_probs.size(0)),
                                        targets]
        self.update_off_policy_agent(beta_log_probs)
        is_weights = torch.clamp_max_(
            torch.exp(action_log_probs) / torch.exp(beta_log_probs),
            CLIPPING_VALUE).detach()
        return actions, action_log_probs, is_weights
    action_trajectory, action_log_probs = self.agent(state,
                                                     deterministic=False)
    actions = self.action_trajectory_to_action(action_trajectory)
    # if self.masking_enabled:
    #     actions, mask = self.mask_action_output(actions)
    return actions, action_log_probs, None
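# Illustrative sketch only: update_off_policy_agent is assumed here to fit the
# behavior-policy approximation by maximizing the log-likelihood of the logged
# actions (i.e. minimizing their negative log-likelihood), which is a standard
# way to estimate beta for the IS weights above. The model sizes, optimizer,
# and placeholder data below are assumptions, not the original implementation.
import torch
import torch.nn as nn

off_policy_agent = nn.Linear(64, 1000)  # placeholder behavior-policy head
optimizer = torch.optim.Adam(off_policy_agent.parameters(), lr=1e-3)


def update_off_policy_agent(beta_log_probs):
    loss = -beta_log_probs.mean()  # NLL of the logged actions
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


state = torch.randn(32, 64)              # placeholder state vectors
targets = torch.randint(0, 1000, (32,))  # logged item indices
beta_log_probs = off_policy_agent(state).log_softmax(
    dim=-1)[torch.arange(32), targets]
update_off_policy_agent(beta_log_probs)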
def get_batch_rl_actions(self):
    return unwrap_state(self.state, device="cpu").targets
def pretrain(self):
    """Pretrains the actor in a supervised fashion"""
    batch_size = self.hyperparameters["batch_size"]
    buffer = Buffer(batch_size, int(1e6))
    logging.info("Filling the train_set")
    env = deepcopy(self.environment)
    for e in env.envs:
        e.num_repeats = 1
    while True:
        obs = env.reset()
        if obs is None:
            break
        done = False
        dummy_actions = np.zeros((len(obs), self.k), dtype=np.int64)
        while not done:
            # put into buffer and sample for a more iid distribution
            buffer.append(obs)
            if obs.shape[0] != dummy_actions.shape[0]:
                dummy_actions = dummy_actions[:obs.shape[0]]
            obs, _, done, _ = env.step(dummy_actions)
    del env
    log_interval = 50
    num_steps = self.hyperparameters.get("pretrain_steps")
    eval_interval = self.hyperparameters.get("pretrain_eval_steps")
    pretrain_fn = self.pretrain_state_from_batch if self.hyperparameters.get(
        "state-only-pretrain") else self.pretrain_from_batch
    eval_fn = self.pretrain_eval_fn if self.hyperparameters.get(
        "state-only-pretrain") else self.get_eval_action
    trailing_loss_values = deque(maxlen=10)
    total = min(len(buffer) // batch_size, num_steps) + 1
    with tqdm(total=total) as t:
        for i, state_batch in zip(range(total), buffer):
            state = unwrap_state(state_batch, device=self.device)
            error = pretrain_fn(state)
            trailing_loss_values.append(error)
            episode_loss = np.mean(list(trailing_loss_values))
            t.set_postfix(loss=episode_loss)
            self.log_scalar("pretrain/loss",
                            episode_loss,
                            global_step=i,
                            interval=log_interval)
            if i % eval_interval == 0 and i:
                self.post_pretrain_hook()
                reward = self.evaluate(scope="pretrain",
                                       global_step=i,
                                       eval_fn=eval_fn)
                if self.hyperparameters.get("state-only-pretrain"):
                    d = {
                        "rnn": self.state_agg.encoder.state_dict(),
                        "optimizer": self.state_optimizer.state_dict(),
                        "embedding": self.embedding.state_dict()
                    }
                    self.pretrain_model_saver.save_model(d,
                                                         i,
                                                         reward,
                                                         scope="valid")
                else:
                    self.locally_save_policy(
                        scope="valid",
                        reward=reward,
                        step=i,
                        model_saver=self.pretrain_model_saver)
            t.update()
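# Illustrative sketch only: one plausible shape for the supervised objective
# that pretrain_fn (pretrain_from_batch) applies to each batch above, assuming
# the actor is trained to reproduce the logged target items with a
# cross-entropy loss. create_state_vector and policy mirror the methods used
# in the other excerpts; actor_optimizer is an assumed attribute, and the real
# pretraining objective in this codebase may differ.
import torch.nn.functional as F


def pretrain_from_batch(self, state):
    state_vector, targets = self.create_state_vector(state)
    action_logits, _ = self.policy(state_vector)
    loss = F.cross_entropy(action_logits, targets)  # predict the logged item
    self.actor_optimizer.zero_grad()
    loss.backward()
    self.actor_optimizer.step()
    return loss.item()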