Example #1
 def act(self, state, available=None, step=0):
     """
     Select an action in the provided state
     :param state: state to select the action in, must be provided in non-linearized, permutation form
     :param available: one-hot encoded vector of the actions that lead to a non-zero reward in this step
     :param step: number of episodes performed so far, used to compute the epsilon-greedy schedule
     :return: selected action, the state-value estimate and the log-probability of the action
     """
     # Compute the value for epsilon depending on the step
     if step < self.steps_epsilon:
         epsilon = 1 - (1 - self.epsilon_end) * (step / self.steps_epsilon)
     else:
         epsilon = self.epsilon_end
     # depending on the epsilon value, select an action ...
     if self.look_ahead_search and epsilon > 0 and random.uniform(0, 1) < epsilon:
         # ... via the look-ahead search or ...
         action = self.look_ahead()
         value, probs = self.net.forward(
             linearize_state(state, self.num_seqs))
         log_prob = torch.log(probs[action])
     else:
         # ... depending on the network state
         action, value, log_prob = self.training_agent.act(
             linearize_state(state, self.num_seqs),
             available if self.supported_search else None)
     return action, value, log_prob
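
The method above decays epsilon linearly from 1 down to epsilon_end over the first steps_epsilon steps and keeps it constant afterwards. A minimal standalone sketch of that schedule (the helper name and the default values are assumptions; only the formula is taken from act above):

    def epsilon_schedule(step, steps_epsilon=1000, epsilon_end=0.05):
        # linear decay from 1.0 down to epsilon_end, then constant (defaults are illustrative)
        if step < steps_epsilon:
            return 1 - (1 - epsilon_end) * (step / steps_epsilon)
        return epsilon_end

    # epsilon_schedule(0) == 1.0, epsilon_schedule(500) == 0.525, epsilon_schedule(1000) == 0.05
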
Example #2
    def train(self, print_progress):
        """
        Performs the learning process.
        :param print_progress: whether to print intermediate progress statistics during training
        :return: the returns, losses and invalid-action ratios computed during the training process
            (usable for analytical and optimization tasks)
        """
        episode_reward, episode_loss, episode_fails = 0, (0, 0), 0
        avg_rewards, avg_losses, avg_fails = [], [], []

        # play as many games as specified for the training
        for step in range(self.games):
            # print the progress the model made while learning
            if (step + 1) % self.plot_size == 0 or self.env.align_table.is_full():
                tmp_reward, tmp_loss, tmp_fail = self.print_progress(
                    print_progress, step, episode_reward, episode_loss,
                    episode_fails)
                avg_rewards.append(tmp_reward)
                avg_losses.append(tmp_loss)
                avg_fails.append(tmp_fail)
                episode_reward, episode_loss, episode_fails = 0, (0, 0), 0

                # if all alignments have been found, exit
                if self.env.align_table.is_full():
                    if self.env.best_alignment == (Profile([]), None):
                        self.env.best_alignment = self.env.align_table.get_best(
                            self.score)
                    if print_progress:
                        print(
                            "Search exited. All alignments have been visited and optimality is guaranteed."
                        )
                    break

            game_reward, state, _, done = self.env.soft_reset()

            # play a new game
            tmp_erb = []
            while not done:
                # compute the action, perform it in the environment and add all stats to the local replay-buffer
                action, value, log_prob = self.act(state, self.env.available)
                prev_state = np.array(state)
                game_reward, state, _, done = self.env.step(action)
                tmp_erb.append(
                    (linearize_state(prev_state, self.num_seqs), action,
                     game_reward[self.score],
                     linearize_state(state, self.num_seqs),
                     done, value, log_prob))

            # update the episode reward with the reward received in this game
            episode_reward += game_reward[self.score]

            if not self.refinement and len(state) != self.num_seqs:
                episode_fails += 1

            # learn from the played game
            episode_loss = [
                e + l for e, l in zip(episode_loss, self.learn(tmp_erb))
            ]
        return avg_rewards, avg_losses, avg_fails
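
Each entry appended to the local replay buffer tmp_erb above follows a fixed layout. A small documentation sketch of that layout (the namedtuple and its field names are illustrative and not part of the original code):

    import collections

    # field order matches the tuples appended to tmp_erb in train() above
    ReplayEntry = collections.namedtuple(
        "ReplayEntry",
        ["state", "action", "reward", "next_state", "done", "value", "log_prob"])
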
Example #3
 def get_state_estimate(self, state):
     """
     Get an estimate of the current state's value based on what has been learned so far
     :param state: current state to estimate
     :return: state-value estimate
     """
     return self.net.forward(linearize_state(state, self.num_seqs))[0]
Example #4
 def get_state_estimate(self, state):
     """
     Estimate the state value based on the current network state, using its state-estimation output neuron
     :param state: state to estimate
     :return: estimate of the expected reward from that state onwards
     """
     return self.net.forward(linearize_state(state, self.num_seqs))[0]
 def act(self, state, available=None, step=0):
     """
     Selects an action based on the given input state.
     :param state: state to select the action for, must be provided in non-linearized, permutation form
     :param available: one-hot encoded vector of the actions that lead to a non-zero reward in this step
     :param step: step in which this action is taken, needed for the epsilon-greedy schedule
     :return: the selected action
     """
     # Compute the value for epsilon depending on the step
     if step < self.steps_epsilon:
         epsilon = 1 - (1 - self.epsilon_end) * (step / self.steps_epsilon)
     else:
         epsilon = self.epsilon_end
     # depending on the epsilon value, select an action...
     if epsilon > 0 and random.uniform(0, 1) < epsilon:
          if self.look_ahead_search and random.uniform(0, 1) < 0.5:
              # ...via the look-ahead search or...
              action = self.look_ahead()
         else:
             # ...randomly
             action = random.choice(
                 list(set(range(self.num_seqs)) -
                      set(state))) - (1 if self.refinement else 0)
     else:
         # ...depending on the network state
         action = self.training_agent.act(
             linearize_state(state, self.num_seqs),
             available if self.supported_search else None)
     return action
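
The random branch above draws one of the sequence indices that is not yet part of the permutation and shifts it by one in refinement mode. A minimal standalone sketch of that selection (the helper name random_valid_action is an assumption):

    import random

    def random_valid_action(state, num_seqs, refinement=False):
        # pick a sequence index that does not yet occur in the current permutation
        remaining = set(range(num_seqs)) - set(state)
        return random.choice(list(remaining)) - (1 if refinement else 0)

    # e.g. random_valid_action((0, 2), num_seqs=4) returns 1 or 3
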
Example #6
 def select(self, state):
     """
     Overrides the select method of the super-class (needed for the final alignment)
     :param state: state of the alignment in non-linearized permutation form
     :return: action to select in the current state
     """
     return self.act(linearize_state(state, self.num_seqs))
Example #7
 def select(self, state):
     """
     Select the next action according to the current network state
     (overrides the parent method)
     :param state: state to select the next sequence for, in non-linearized form (as a permutation)
     :return: selected action as the index of the next sequence
     """
     return self.act(linearize_state(state, self.num_seqs))
Example #8
    def select(self, state):
        """
        Select an action based on the input state
        :param state: non-linearized state to select the action for
        :return: action sampled according to the probabilities obtained from the net
        """
        probs = self.net.forward(linearize_state(state, self.num_seqs))[1].detach().numpy()
        action = np.random.choice(range(self.num_seqs), p=probs)

        if self.refinement:
            action -= 1

        return action
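
The sampling step above can be looked at in isolation: the detached policy output is treated as a probability vector and an action index is drawn from it. A short sketch with a hand-made probability vector (the values are illustrative only):

    import numpy as np

    probs = np.array([0.1, 0.2, 0.3, 0.4])  # stand-in for the detached network output
    action = np.random.choice(range(len(probs)), p=probs)  # index drawn proportionally to probs
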
Example #9
    def select(self, state):
        """
        Select the next action according to the current network state
        (overrides the parent method)
        :param state: state to select the next sequence for, in non-linearized form (as a permutation)
        :return: selected action as the index of the next sequence
        """
        action = self.net.forward(linearize_state(
            state, self.num_seqs))[1].argmax().item()

        if self.refinement:
            action -= 1

        return action

    def generate_mcts_episode(self):
        """
        Generate training data based on which actions the network would perform and
        what an MCTS supervisor thinks about the resulting states
        :return: replay buffer of (linearized state, move probabilities, state estimate) tuples
        """
        replay_buffer = []
        _, state, _, done = self.env.soft_reset()

        while not done:
            # evaluate the state using MCTS to get move probabilities and a state estimate
            probs, s_est = self.mcts_generator.get_probabilities(state)

            # use the network's current state to select the next action
            with torch.no_grad():
                action = self.training_agent.select(state)

            # append the data to the replay-buffer
            replay_buffer.append((linearize_state(state, self.num_seqs), probs, s_est))

            # and apply the selected action to the state
            _, state, _, done = self.env.step(action)
        return replay_buffer
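
The replay buffer returned above pairs each linearized state with the MCTS move probabilities and the MCTS state estimate. How these targets are consumed is not shown in this snippet; a common AlphaZero-style loss over such samples could look roughly like the following sketch (an assumption, not the original learn step):

    import torch.nn.functional as F

    def mcts_loss(pred_value, pred_log_probs, target_value, target_probs):
        # value head: mean-squared error against the MCTS state estimate
        value_loss = F.mse_loss(pred_value, target_value)
        # policy head: cross-entropy against the MCTS move probabilities
        policy_loss = -(target_probs * pred_log_probs).sum(dim=-1).mean()
        return value_loss + policy_loss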