def act(self, state, available=None, step=0):
    """
    Select an action in the provided state.
    :param state: state to select the action in, must be provided in non-linearized, permutation form
    :param available: vector containing the actions that lead to non-zero reward in this step in one-hot encoding
    :param step: number of episodes performed so far, used to evaluate the epsilon-greedy look-ahead
    :return: selected action together with the network's value estimate and the log-probability of the action
    """
    # compute the value of epsilon depending on the step
    if step < self.steps_epsilon:
        epsilon = 1 - (1 - self.epsilon_end) * (step / self.steps_epsilon)
    else:
        epsilon = self.epsilon_end

    # depending on the epsilon value, select an action ...
    if self.look_ahead_search and epsilon > 0 and random.uniform(0, 1) < epsilon:
        # ... via the look-ahead search or ...
        action = self.look_ahead()
        value, probs = self.net.forward(linearize_state(state, self.num_seqs))
        log_prob = torch.log(probs[action])
    else:
        # ... depending on the network state
        action, value, log_prob = self.training_agent.act(
            linearize_state(state, self.num_seqs),
            available if self.supported_search else None)
    return action, value, log_prob
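
# A minimal, self-contained sketch of the linear epsilon schedule used in act() above.
# The concrete values for steps_epsilon and epsilon_end are hypothetical stand-ins for
# the corresponding agent attributes.
def _epsilon_schedule_demo(steps_epsilon=1000, epsilon_end=0.1):
    # epsilon decays linearly from 1.0 at step 0 to epsilon_end at step steps_epsilon
    # and stays constant afterwards
    schedule = []
    for step in (0, steps_epsilon // 2, steps_epsilon, 2 * steps_epsilon):
        if step < steps_epsilon:
            epsilon = 1 - (1 - epsilon_end) * (step / steps_epsilon)
        else:
            epsilon = epsilon_end
        schedule.append((step, epsilon))
    return schedule  # [(0, 1.0), (500, 0.55), (1000, 0.1), (2000, 0.1)]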

def train(self, print_progress):
    """
    Performs the learning process.
    :param print_progress: whether to print intermediate progress statistics
    :return: the returns, losses and invalid-action ratios computed during the training process (usable for analytical and optimization tasks)
    """
    episode_reward, episode_loss, episode_fails = 0, (0, 0), 0
    avg_rewards, avg_losses, avg_fails = [], [], []

    # play as many games as specified for the training
    for step in range(self.games):
        # print the progress the model made while learning
        if (step + 1) % self.plot_size == 0 or self.env.align_table.is_full():
            tmp_reward, tmp_loss, tmp_fail = self.print_progress(
                print_progress, step, episode_reward, episode_loss, episode_fails)
            avg_rewards.append(tmp_reward)
            avg_losses.append(tmp_loss)
            avg_fails.append(tmp_fail)
            episode_reward, episode_loss, episode_fails = 0, (0, 0), 0

        # if all alignments have been found, exit
        if self.env.align_table.is_full():
            if self.env.best_alignment == (Profile([]), None):
                self.env.best_alignment = self.env.align_table.get_best(self.score)
            if print_progress:
                print("Search exited. All alignments have been visited and optimality is guaranteed.")
            break

        # play a new game
        game_reward, state, _, done = self.env.soft_reset()
        tmp_erb = []
        while not done:
            # compute the action, perform it in the environment and add all stats to the local replay-buffer
            action, value, log_prob = self.act(state, self.env.available)
            prev_state = np.array(state)
            game_reward, state, _, done = self.env.step(action)
            tmp_erb.append((linearize_state(prev_state, self.num_seqs), action,
                            game_reward[self.score],
                            linearize_state(state, self.num_seqs), done, value, log_prob))

        # update the episode statistics according to the received reward
        episode_reward += game_reward[self.score]
        if not self.refinement and len(state) != self.num_seqs:
            episode_fails += 1

        # learn from the played game
        episode_loss = [e + l for e, l in zip(episode_loss, self.learn(tmp_erb))]

    return avg_rewards, avg_losses, avg_fails
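
# Hedged usage sketch (not part of the original agent class): one way the statistics
# returned by train() could be inspected. `agent` stands for an already-constructed
# training agent; matplotlib is only used here for illustration.
def _plot_training_stats(agent):
    import matplotlib.pyplot as plt
    avg_rewards, avg_losses, avg_fails = agent.train(print_progress=True)
    plt.plot(avg_rewards, label="average reward")
    plt.plot(avg_fails, label="invalid-action ratio")
    plt.xlabel("plot window")
    plt.legend()
    plt.show()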

def get_state_estimate(self, state):
    """
    Get an estimate of the actual state based on previous learning.
    :param state: actual state to estimate
    :return: state estimate
    """
    return self.net.forward(linearize_state(state, self.num_seqs))[0]

def get_state_estimate(self, state):
    """
    Estimate the state based on the actual network-state using its state-estimation output-neuron.
    :param state: state to estimate
    :return: estimate of the expected reward from that state on
    """
    return self.net.forward(linearize_state(state, self.num_seqs))[0]

def act(self, state, available=None, step=0):
    """
    Selects an action based on the given input-state.
    :param state: state to select the action for, must be provided in non-linearized, permutation form
    :param available: vector containing the actions that lead to non-zero reward in this step in one-hot encoding
    :param step: step in which this action is made, needed for epsilon-greediness
    :return: the selected action
    """
    # compute the value of epsilon depending on the step
    if step < self.steps_epsilon:
        epsilon = 1 - (1 - self.epsilon_end) * (step / self.steps_epsilon)
    else:
        epsilon = self.epsilon_end

    # depending on the epsilon value, select an action ...
    if epsilon > 0 and random.uniform(0, 1) < epsilon:
        if self.look_ahead_search and random.uniform(0, 1) < 0.5:
            # ... via the look-ahead search ...
            action = self.look_ahead()
        else:
            # ... randomly among the sequences not yet part of the permutation ...
            action = random.choice(
                list(set(range(self.num_seqs)) - set(state))) - (1 if self.refinement else 0)
    else:
        # ... or depending on the network state
        action = self.training_agent.act(
            linearize_state(state, self.num_seqs),
            available if self.supported_search else None)
    return action
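
# Minimal sketch of the random branch in act() above: choose uniformly among the
# sequences that are not yet part of the permutation. num_seqs, state and refinement
# are stand-ins for the corresponding agent attributes and arguments.
def _random_action_demo(num_seqs=5, state=(2, 0), refinement=False):
    import random
    remaining = set(range(num_seqs)) - set(state)  # here: {1, 3, 4}
    action = random.choice(list(remaining))
    # during refinement the action indices are shifted down by one
    return action - (1 if refinement else 0)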

def select(self, state):
    """
    Override the select-method from the super-class (needed for the final alignment).
    :param state: state of the alignment in non-linearized permutation form
    :return: action to select in the actual state
    """
    return self.act(linearize_state(state, self.num_seqs))

def select(self, state):
    """
    Select the next action according to the actual network state; needed to override the parent method.
    :param state: state to select the next sequence from, in non-linearized form (just as permutation)
    :return: selected action as index of the next sequence
    """
    return self.act(linearize_state(state, self.num_seqs))

def select(self, state):
    """
    Select an action based on the input state.
    :param state: non-linearized state to select the action for
    :return: action sampled according to the probabilities obtained from the net
    """
    probs = self.net.forward(linearize_state(state, self.num_seqs))[1].detach().numpy()
    action = np.random.choice(range(self.num_seqs), p=probs)
    # during refinement the action space is shifted by one index
    if self.refinement:
        action -= 1
    return action

def select(self, state):
    """
    Select the next action according to the actual network state; needed to override the parent method.
    :param state: state to select the next sequence from, in non-linearized form (just as permutation)
    :return: selected action as index of the next sequence
    """
    action = self.net.forward(linearize_state(state, self.num_seqs))[1].argmax().item()
    # during refinement the action space is shifted by one index
    if self.refinement:
        action -= 1
    return action
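
# Toy sketch contrasting the two select() variants above: the first samples an action
# proportionally to the network's output probabilities, the second greedily takes the
# argmax. The probability vector below is made up.
def _select_demo():
    import numpy as np
    probs = np.array([0.1, 0.6, 0.3])
    sampled_action = np.random.choice(np.arange(len(probs)), p=probs)  # stochastic, mostly 1
    greedy_action = int(probs.argmax())                                # deterministic, always 1
    return sampled_action, greedy_action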

def generate_mcts_episode(self):
    """
    Generate training data based on which actions the network would perform and how an MCTS supervisor evaluates the resulting states.
    :return: replay-buffer of (linearized state, move-probabilities, state estimate) tuples
    """
    replay_buffer = []
    _, state, _, done = self.env.soft_reset()
    while not done:
        # evaluate the state using MCTS to get move-probabilities and a state estimate
        probs, s_est = self.mcts_generator.get_probabilities(state)

        # use the network's actual state to select the next action
        with torch.no_grad():
            action = self.training_agent.select(state)

        # append the data to the replay-buffer ...
        replay_buffer.append((linearize_state(state, self.num_seqs), probs, s_est))

        # ... and apply the selected action to the state
        _, state, _, done = self.env.step(action)
    return replay_buffer
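
# Hedged sketch of how the (state, probabilities, state-estimate) tuples produced above
# could be turned into a training loss. The concrete loss form (cross-entropy against the
# MCTS probabilities plus a squared error on the state estimate, AlphaZero-style) is an
# assumption and not taken from this repository; the net is assumed to return
# (value, probs) as in the methods above.
def _mcts_loss_sketch(net, replay_buffer):
    import torch
    policy_loss, value_loss = 0.0, 0.0
    for lin_state, target_probs, target_value in replay_buffer:
        value, probs = net.forward(lin_state)
        target_probs = torch.as_tensor(target_probs, dtype=torch.float32)
        policy_loss = policy_loss - torch.sum(target_probs * torch.log(probs + 1e-8))
        value_loss = value_loss + (value - float(target_value)) ** 2
    return policy_loss + value_loss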