    def separate_out_data_types(self, experiences):
        """Puts the sampled experience into the correct format for a PyTorch neural network"""

        states = [
            e.value[self.indexes_in_node_value_tuple["state"]]
            for e in experiences if e is not None
        ]

        states = unwrap_state(states, device=self.device)
        actions = torch.from_numpy(
            np.vstack([
                e.value[self.indexes_in_node_value_tuple["action"]]
                for e in experiences if e is not None
            ])).long().to(self.device)
        rewards = torch.from_numpy(
            np.vstack([
                e.value[self.indexes_in_node_value_tuple["reward"]]
                for e in experiences if e is not None
            ])).float().to(self.device)

        next_states = [
            e.value[self.indexes_in_node_value_tuple["next_state"]]
            for e in experiences if e is not None
        ]
        next_states = unwrap_state(next_states, device=self.device)

        dones = torch.from_numpy(
            np.vstack([
                int(e.value[self.indexes_in_node_value_tuple["done"]])
                for e in experiences if e is not None
            ])).float().to(self.device)

        return states, actions, rewards, next_states, dones
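For reference, here is a hedged sketch of what a compatible unwrap_state helper might look like, inferred only from how it is called in these examples (a single observation or a list of observations in; a batched, device-placed object with a targets field out). The field names other than targets, and the dict-style observations, are illustrative assumptions, not the repository's actual implementation.

from collections import namedtuple

import numpy as np
import torch

# Hypothetical container; only the `targets` field is confirmed by the examples on this page.
State = namedtuple("State", ["items", "lengths", "targets"])


def unwrap_state(raw, device="cpu"):
    """Collate one observation or a list of observations into batched tensors on `device`."""
    batch = raw if isinstance(raw, (list, tuple)) else [raw]

    def stack(key):
        # stack the per-observation arrays for one field and move them to the target device
        return torch.from_numpy(np.stack([b[key] for b in batch])).long().to(device)

    return State(items=stack("items"), lengths=stack("lengths"), targets=stack("targets"))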
Example #2
    def pick_action(self, state=None, eval_ep=False, top_k=None):
        if top_k is None:
            top_k = self.k
        if state is None:
            state = self.state
        if self.global_step_number < self.hyperparameters[
                "min_steps_before_learning"] and not eval_ep:
            return self.sample_from_action_space(num_items=self.metrics_k)
        with torch.no_grad():
            if not eval_ep:
                state = unwrap_state(state, device=self.device)
            state, targets = self.create_state_vector(state)
            self.eval()

            def action_fn():
                return self.state_to_action(state, eval_ep, top_k)

            action = self.exploration_strategy.perturb_action_for_exploration_purposes(
                {
                    "action_fn": action_fn,
                    "turn_off_exploration": eval_ep,
                    "episode_number": self.episode_number,
                    "sample_shape": top_k
                })
            self.to_train()
        if isinstance(action, torch.Tensor):
            action = action.cpu().detach().numpy()
        return action
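The dict passed to perturb_action_for_exploration_purposes above defines the whole interface the exploration strategy needs. As a hedged illustration only, not the repository's class, an epsilon-greedy-style strategy compatible with that interface could look like this; the decay schedule and constructor arguments are assumptions.

import random

import numpy as np


class EpsilonGreedyExploration:
    """Illustrative exploration strategy matching the dict interface used above."""

    def __init__(self, action_space_size, epsilon_decay_denominator=200.0):
        self.action_space_size = action_space_size
        self.epsilon_decay_denominator = epsilon_decay_denominator

    def perturb_action_for_exploration_purposes(self, info):
        if info["turn_off_exploration"]:
            return info["action_fn"]()
        epsilon = 1.0 / (1.0 + info["episode_number"] / self.epsilon_decay_denominator)
        if random.random() < epsilon:
            # explore: sample `sample_shape` distinct actions uniformly at random
            return np.random.choice(self.action_space_size,
                                    size=info["sample_shape"],
                                    replace=False)
        # exploit: fall back to the greedy action produced by the agent
        return info["action_fn"]()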
Example #3
    def pick_action_and_get_log_probabilities(self, state=None):
        """Picks actions and then calculates the log probabilities of the actions it picked given the policy"""
        if state is None:
            state = self.state
        state = unwrap_state(state, device=self.device)
        state, current_targets = self.create_state_vector(state)

        action_logits, hidden_state = self.policy(state)
        beta_logits = self.beta(hidden_state.detach(), current_targets)

        with torch.no_grad():
            beta_probs = beta_logits.softmax(dim=-1)
            # add PROB_MIN because the sampling probabilities must not be exactly 0;
            # large logits can drive some softmax outputs to (numerically) zero
            beta_samples = torch.multinomial(beta_probs + PROB_MIN, self.k)
            beta_prob = beta_probs.gather(1, beta_samples)

        ppo_weight = None
        action_log_prob = action_logits.log_softmax(dim=-1)
        if self.use_ppo:
            with torch.no_grad():
                curr_samples = torch.multinomial(
                    action_log_prob.exp() + PROB_MIN, self.k)
                curr_prob = action_log_prob.gather(1, curr_samples)

                action_logits_last, _ = self.last_policy(state)
                action_prob_last = action_logits_last.softmax(dim=-1).gather(
                    1, curr_samples)

                ppo_weight = curr_prob.exp() / (action_prob_last + 1e-8)

        action_prob = action_log_prob.gather(1, beta_samples)
        correction = torch.clamp_max_(
            torch.exp(action_prob) / beta_prob, CLIPPING_VALUE).detach()
        return (beta_samples.cpu().detach().numpy(), action_prob, correction,
                ppo_weight)
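The clamped correction returned above is an importance-sampling ratio between the learned policy and the behaviour (beta) policy. As a hedged sketch of how such outputs are typically combined downstream, not code from this repository, an off-policy corrected REINFORCE-style loss could be formed as follows; returns is an assumed tensor of discounted returns aligned with the sampled slate positions.

import torch


def corrected_reinforce_loss(action_log_prob, correction, returns):
    """Importance-weighted REINFORCE objective built from the outputs above."""
    return -(correction * returns * action_log_prob).mean()

When use_ppo is enabled, the additional ppo_weight would normally enter a PPO-style clipped objective; that part is omitted here.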
    def separate_out_data_types(self, experiences):
        """Puts the sampled experience into the correct format for a PyTorch neural network"""

        states = [e.state for e in experiences if e is not None]

        states = unwrap_state(states, device=self.device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences
                       if e is not None])).long().to(self.device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences
                       if e is not None])).float().to(self.device)

        next_states = [e.next_state for e in experiences if e is not None]
        next_states = unwrap_state(next_states, device=self.device)

        dones = torch.from_numpy(
            np.vstack([int(e.done) for e in experiences
                       if e is not None])).float().to(self.device)

        return states, actions, rewards, next_states, dones
Example #5
    def pick_action(self, state=None, eval_ep=False, top_k=None):
        if top_k is None:
            top_k = self.k
        if state is None:
            state = self.state
        if self.global_step_number < self.hyperparameters[
                "min_steps_before_learning"] and not eval_ep:
            return self.sample_from_action_space(num_items=self.metrics_k)
        with torch.no_grad():
            if not eval_ep:
                state = unwrap_state(state, device=self.device)
            state, targets = self.create_state_vector(state)
            action = self.state_to_action(state, eval_ep, top_k)
        return action
Example #6
    def pick_action_and_log_probs(self, state=None):
        if state is None:
            state = self.state
        state = unwrap_state(state, device=self.device)
        state, targets = self.create_state_vector(state)
        if self.hyperparameters["batch_rl"]:
            # Get the log probs for the target policy
            actions, action_log_probs = self.agent.log_probs_for_actions(state, targets)

            # Update the off-policy network for IS weights (behavior policy approximation)
            beta_logits = self.off_policy_agent(state.detach())
            beta_log_probs = beta_logits.log_softmax(dim=-1)
            beta_log_probs = beta_log_probs[torch.arange(beta_log_probs.size(0)), targets]
            self.update_off_policy_agent(beta_log_probs)

            is_weights = torch.clamp_max_(torch.exp(action_log_probs) / torch.exp(beta_log_probs),
                                          CLIPPING_VALUE).detach()
            return actions, action_log_probs, is_weights

        action_trajectory, action_log_probs = self.agent(state, deterministic=False)
        actions = self.action_trajectory_to_action(action_trajectory)
        # if self.masking_enabled:
        #     actions, mask = self.mask_action_output(actions)
        return actions, action_log_probs, None
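update_off_policy_agent is not shown on this page. The comment above describes the off-policy network as a behaviour-policy approximation used for the IS weights, which suggests a maximum-likelihood update on the logged actions along these lines; this is a sketch only, and the optimizer attribute name is an assumption.

    def update_off_policy_agent(self, beta_log_probs):
        # negative log-likelihood of the logged actions under the behaviour-policy network
        loss = -beta_log_probs.mean()
        self.off_policy_optimizer.zero_grad()
        loss.backward()
        self.off_policy_optimizer.step()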
Example #7
    def get_batch_rl_actions(self):
        return unwrap_state(self.state, device="cpu").targets
Example #8
    def pretrain(self):
        """
        Pretraining the actor in a supervised-fashion
        """
        batch_size = self.hyperparameters["batch_size"]
        buffer = Buffer(batch_size, int(1e6))

        logging.info("Filling the train_set")
        env = deepcopy(self.environment)
        for e in env.envs:
            e.num_repeats = 1
        while True:
            obs = env.reset()
            if obs is None:
                break
            done = False
            dummy_actions = np.zeros((len(obs), self.k), dtype=int)
            while not done:
                # store observations in the buffer so later sampling is closer to i.i.d.
                buffer.append(obs)
                if obs.shape[0] != dummy_actions.shape[0]:
                    dummy_actions = dummy_actions[:obs.shape[0]]
                obs, _, done, _ = env.step(dummy_actions)
        del env

        log_interval = 50
        num_steps = self.hyperparameters.get("pretrain_steps")
        eval_interval = self.hyperparameters.get("pretrain_eval_steps")
        pretrain_fn = self.pretrain_state_from_batch if self.hyperparameters.get(
            "state-only-pretrain") else self.pretrain_from_batch

        eval_fn = self.pretrain_eval_fn if self.hyperparameters.get(
            "state-only-pretrain") else self.get_eval_action

        trailing_loss_values = deque(maxlen=10)
        total = min(len(buffer) // batch_size, num_steps) + 1
        with tqdm(total=total) as t:
            for i, state_batch in zip(range(total), buffer):
                state = unwrap_state(state_batch, device=self.device)
                error = pretrain_fn(state)
                trailing_loss_values.append(error)
                episode_loss = np.mean(list(trailing_loss_values))
                t.set_postfix(loss=episode_loss)
                self.log_scalar("pretrain/loss",
                                episode_loss,
                                global_step=i,
                                interval=log_interval)
                if i % eval_interval == 0 and i:
                    self.post_pretrain_hook()
                    reward = self.evaluate(scope="pretrain",
                                           global_step=i,
                                           eval_fn=eval_fn)
                    if self.hyperparameters.get("state-only-pretrain"):
                        d = {
                            "rnn": self.state_agg.encoder.state_dict(),
                            "optimizer": self.state_optimizer.state_dict(),
                            "embedding": self.embedding.state_dict()
                        }
                        self.pretrain_model_saver.save_model(d,
                                                             i,
                                                             reward,
                                                             scope="valid")
                    else:
                        self.locally_save_policy(
                            scope="valid",
                            reward=reward,
                            step=i,
                            model_saver=self.pretrain_model_saver)
                t.update()
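The Buffer class used by pretrain is also not included on this page. From its usage (append batches of observations, len(), and iteration that yields roughly i.i.d. batches of batch_size observations for unwrap_state), a minimal compatible implementation might look like this sketch; the internal layout is an assumption.

import random


class Buffer:
    """Illustrative replay-style buffer matching how pretrain() uses it."""

    def __init__(self, batch_size, max_size):
        self.batch_size = batch_size
        self.max_size = max_size
        self.items = []

    def append(self, observations):
        # store observations individually so iteration can re-batch them
        for obs in observations:
            if len(self.items) >= self.max_size:
                self.items.pop(0)
            self.items.append(obs)

    def __len__(self):
        return len(self.items)

    def __iter__(self):
        # shuffle once, then yield consecutive batches of batch_size observations
        random.shuffle(self.items)
        for start in range(0, len(self.items) - self.batch_size + 1, self.batch_size):
            yield self.items[start:start + self.batch_size]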