def next_state(state, action):
    """Apply a jump move to a board state and return the resulting state string.

    Both arguments are comma-separated digit strings decoded by the
    file-local `convert_string_to_list` helper. The action decodes to
    exactly three cells: the jumping peg's cell, the jumped-over cell,
    and the landing cell.
    """
    board = convert_string_to_list(state)
    src, over, dst = convert_string_to_list(action)
    # The jumper leaves its cell, the jumped peg is removed,
    # and the landing cell gains a peg.
    board[src[0]][src[1]] = 0
    board[over[0]][over[1]] = 0
    board[dst[0]][dst[1]] = 1
    return convert_list_to_string(board)
def update_model_and_eligibilities(self, state, target, td_error):
    """Train the critic model on `state` and decay all eligibility traces.

    The comma separators are stripped so the board decodes to a single
    flat feature row, which is passed to `self.fit` together with the
    TD target and TD error. Afterwards every trace in
    `self.eligibilities` is scaled in place by the decay rate.
    """
    flat_features = convert_string_to_list(state.replace(',', ''))[0]
    self.fit(flat_features, target, td_error)
    # Decay traces in place so any external references stay valid.
    for idx, trace in enumerate(self.eligibilities):
        self.eligibilities[idx] = trace * self.eligibility_decay_rate
def find_value(self, state):
    """Return the model's scalar value estimate for a board state.

    Strips the comma separators, decodes the state to one flat feature
    row, wraps it in a batch of size 1, and extracts the single output
    value from the model's prediction tensor.
    """
    flat_features = convert_string_to_list(state.replace(',', ''))[0]
    batch = np.array([flat_features])
    prediction = self.model(batch)
    # prediction is a (1, 1) tensor; unwrap the lone scalar.
    return prediction.numpy()[0][0]
def run(self):
    """Run the full training loop: one actor-critic episode per configured episode count.

    For each episode a fresh SimWorld (PegBoard + PegPlayer) is built, the
    actor picks actions, and the critic (table- or network-based, chosen by
    the `critic_table` config flag) computes TD errors that drive both the
    critic's and the actor's updates. Results (pegs left per episode) are
    collected in `self.total_pegs_left_per_episode` for plotting.
    """
    # Reset the per-episode history of pegs left.
    self.total_pegs_left_per_episode = []
    for i in range(self.config['number_of_episodes']):
        print('Episode ', i)
        # Initialize the SimWorld: PegBoard and PegPlayer.
        peg_board = PegBoard(self.config['size'], self.config['is_diamond'], self.config['empty_nodes'])
        peg_player = PegPlayer(peg_board, self.config['reward_win'], self.config['reward_lose'])
        # Whether this episode should be displayed: either all episodes
        # are shown, or only the very last one.
        display = self.config['display_games'] == "all" or (
            self.config['display_games'] == "last" and i == self.config['number_of_episodes'] - 1)
        # Optionally make the actor fully greedy (epsilon = 0) on the last episode.
        if self.config['epsilon_zero_on_last_episode'] and i == self.config['number_of_episodes'] - 1:
            self.actor.set_epsilon_to_zero()
        # Get the initial state from the freshly built board.
        state = convert_list_to_string(peg_board.grid)
        # Actor: choose the first action.
        action = self.actor.choose_action(state)
        # action == None --> no legal actions exist for this board
        # configuration, so training cannot proceed at all.
        if not action:
            print('No legal actions!')
            break
        if display:
            visualize_board(convert_string_to_list(state))
        # Reset eligibilities for the new episode.
        # NOTE: the actor's reset also decreases epsilon.
        self.actor.reset_episode_parameters()
        self.critic.reset_episode_parameters()
        while not peg_player.game_over():
            # Set eligibilities to 1 for the current state (and action,
            # for the actor). The critic only keeps per-state traces when
            # it is table-based.
            self.actor.set_eligibility(state, action)
            if self.config['critic_table']:
                self.critic.set_eligibility(state)
            # Execute the action; the SimWorld's PegPlayer returns the
            # successor state and the reward.
            next_state, reward = peg_player.execute_action(action)
            # Actor: choose the next action only if the game is not over.
            if not peg_player.game_over():
                next_action = self.actor.choose_action(next_state)
            else:
                next_action = None
            # Critic: compute the TD error and update values/model plus
            # eligibilities (table-based and network-based critics have
            # different update entry points).
            if self.config['critic_table']:
                td_error = self.critic.get_TD_error(state, next_state, reward)
                self.critic.update_values_and_eligibilities(td_error)
            else:
                target, td_error = self.critic.get_target_and_TD_error(state, next_state, reward)
                self.critic.update_model_and_eligibilities(state, target, td_error)
            # Actor: use the critic's TD error to update SAP values and eligibilities.
            self.actor.update_values_and_eligibilities(td_error)
            state = next_state
            action = next_action
            # Visualize the game step if the display flag is True.
            if display:
                sleep(self.config['display_delay'])
                visualize_board(convert_string_to_list(state))
        # Save the episode result for plotting.
        self.total_pegs_left_per_episode.append(peg_board.total_pegs_left)
        # Print the result after the last episode.
        if i == self.config['number_of_episodes'] - 1:
            print('Total pegs left last episode - ', peg_board.total_pegs_left)
def get_possible_actions(state):
    """Return all legal jump moves for the given board state.

    Each action is encoded as "RC,RC,RC": the jumper's cell, the
    jumped-over cell, and the empty landing cell, with row and column
    concatenated as digits.
    NOTE(review): this digit-concatenation encoding is only unambiguous
    for board sizes <= 10 (single-digit indices) — confirm the config
    never exceeds that.

    The grid shape is detected from the first row: a diamond grid is a
    full size x size matrix (len(grid[0]) > 1), while a triangle grid has
    rows of increasing length (row i holds i + 1 cells).
    """
    grid = convert_string_to_list(state)
    possible_actions = []
    size = len(grid)
    for i in range(size):
        # Diamond rows are full width; triangle row i only has i + 1 cells.
        for j in range(size if len(grid[0]) > 1 else i + 1):
            if grid[i][j] == 1:
                if len(grid[0]) > 1:
                    # Grid is diamond shaped: bounds are simple square-matrix checks.
                    # Direction: up
                    if i > 1 and grid[i - 1][j] == 1 and grid[i - 2][j] == 0:
                        possible_actions.append(
                            str(i) + str(j) + "," + str(i - 1) + str(j) + "," + str(i - 2) + str(j))
                    # Direction: right
                    if j < size - 2 and grid[i][j + 1] == 1 and grid[i][j + 2] == 0:
                        possible_actions.append(
                            str(i) + str(j) + "," + str(i) + str(j + 1) + "," + str(i) + str(j + 2))
                else:
                    # Grid is triangle shaped: column j must also exist two rows
                    # up, i.e. j <= i - 2, which is j < len(grid[i]) - 2.
                    # Direction: up
                    if i > 1 and j < len(grid[i]) - 2 and grid[
                            i - 1][j] == 1 and grid[i - 2][j] == 0:
                        possible_actions.append(
                            str(i) + str(j) + "," + str(i - 1) + str(j) + "," + str(i - 2) + str(j))
                    # Direction: right
                    # NOTE(review): the `i > 1` guard looks redundant here —
                    # a triangle row needs >= 3 cells for a rightward jump,
                    # which already implies i >= 2.
                    if i > 1 and j < len(grid[i]) - 2 and grid[i][
                            j + 1] == 1 and grid[i][j + 2] == 0:
                        possible_actions.append(
                            str(i) + str(j) + "," + str(i) + str(j + 1) + "," + str(i) + str(j + 2))
                # The remaining four directions use the same index arithmetic
                # for both grid shapes.
                # Direction: down & right
                if i < size - 2 and j < size - 2 and grid[i + 1][
                        j + 1] == 1 and grid[i + 2][j + 2] == 0:
                    possible_actions.append(
                        str(i) + str(j) + "," + str(i + 1) + str(j + 1) + "," + str(i + 2) + str(j + 2))
                # Direction: down
                if i < size - 2 and grid[i + 1][j] == 1 and grid[i + 2][j] == 0:
                    possible_actions.append(
                        str(i) + str(j) + "," + str(i + 1) + str(j) + "," + str(i + 2) + str(j))
                # Direction: left
                if j > 1 and grid[i][j - 1] == 1 and grid[i][j - 2] == 0:
                    possible_actions.append(
                        str(i) + str(j) + "," + str(i) + str(j - 1) + "," + str(i) + str(j - 2))
                # Direction: up & left
                if i > 1 and j > 1 and grid[i - 1][j - 1] == 1 and grid[i - 2][
                        j - 2] == 0:
                    possible_actions.append(
                        str(i) + str(j) + "," + str(i - 1) + str(j - 1) + "," + str(i - 2) + str(j - 2))
    return possible_actions