Example #1
import numpy as np

class DeepQLearner(BaseLearner):
    def __init__(self):
        super(DeepQLearner, self).__init__()
        self.discount_rate = 0.95
        self.network = QNetwork()
        self.database = ExperienceDatabase(100000)
        self.weights = []

    def decide_action(self, new_state, possible_moves):
        # compute Q-scores with forward-propagation
        scores = np.zeros(len(possible_moves))
        for i, move in enumerate(possible_moves):
            scores[i] = self.network.use_model(new_state, move)

        # train neural-network
        if self.last_state is not None:
            self.database.add(self.last_state, self.last_action, self.last_reward, new_state, possible_moves)

            past_last_state, past_last_action, past_last_reward, past_new_state, past_possible_moves = self.database.sample(1)[0]

            # Q-learning target: max Q-score over the sampled transition's next state
            best_score = float('-inf')
            for move in past_possible_moves:
                best_score = max(best_score, self.network.use_model(past_new_state, move))

            # update weights with back-propagation
            self.network.update_model(past_last_state, past_last_action, float(past_last_reward + self.discount_rate * best_score))

        return possible_moves[self.explorer.decide_action(self.epoch, scores)]

    def end_epoch(self, score):
        super(DeepQLearner, self).end_epoch(score)

        #save the network weights at this epoch
        if self.epoch % 1000 == 0:
            self.weights.append(self.network.get_all_weights())
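
The ExperienceDatabase used in Example #1 is not shown on this page; the code only relies on an add(...) method and a sample(n) method that returns stored transition tuples. A minimal sketch of a replay buffer with that interface, assuming a fixed capacity and uniform random sampling (the internals here are an assumption, not the original implementation):

import random
from collections import deque

class ExperienceDatabase(object):
    """Fixed-size replay buffer; the oldest transitions are dropped first."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def add(self, last_state, last_action, last_reward, new_state, possible_moves):
        # store one transition tuple
        self.memory.append((last_state, last_action, last_reward, new_state, possible_moves))

    def sample(self, n):
        # uniform random sample of n stored transitions
        return random.sample(self.memory, n)
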
Example #2
import numpy as np
import numpy.random as npr

class DeepQLearner(BaseLearner):
    def __init__(self):
        super(DeepQLearner, self).__init__()
        self.discount_rate = 0.95
        self.network = QNetwork()
        self.weights = []

    def decide_action(self, new_state, possible_moves):
        if self.last_state is None:
            return npr.choice(possible_moves)

        # compute Q-scores with forward-propagation
        list_Qscore = []
        for move in possible_moves:
            list_Qscore.append(self.network.use_model(new_state, move))

        # get the best action & Q-score from current state
        best_Qscore = max(list_Qscore)
        # best_move = possible_moves[list_Qscore.index(best_Qscore)]

        # update weights with back-propagation
        self.network.update_model(self.last_state, self.last_action, float(self.last_reward + self.discount_rate * best_Qscore))

        return possible_moves[self.explorer.decide_action(self.epoch, np.asarray(list_Qscore))]

    def end_epoch(self, score):
        super(DeepQLearner, self).end_epoch(score)

        #save the network weights at this epoch
        if self.epoch % 1000 == 0:
            self.weights.append(self.network.get_all_weights())
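
Both variants delegate the exploration/exploitation choice to self.explorer.decide_action(self.epoch, scores), which is expected to return an index into possible_moves. That explorer is not part of these snippets; a minimal epsilon-greedy sketch with the same call signature (the class name, epsilon values, and linear decay schedule are assumptions):

import numpy as np
import numpy.random as npr

class EpsilonGreedyExplorer(object):
    """Pick a random index with probability epsilon, else the best-scoring index."""

    def __init__(self, start_eps=1.0, end_eps=0.05, decay_epochs=10000):
        self.start_eps = start_eps
        self.end_eps = end_eps
        self.decay_epochs = decay_epochs

    def decide_action(self, epoch, scores):
        # linearly anneal epsilon over the first decay_epochs epochs
        frac = min(float(epoch) / self.decay_epochs, 1.0)
        eps = self.start_eps + frac * (self.end_eps - self.start_eps)
        if npr.random() < eps:
            return npr.randint(len(scores))
        return int(np.argmax(scores))
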
Example #3
def __init__(self):
    super(DeepQLearner, self).__init__()
    self.discount_rate = 0.95
    self.network = QNetwork()
    self.database = ExperienceDatabase(100000)
    self.weights = []
Example #4
def __init__(self):
    super(DeepQLearner, self).__init__()
    self.discount_rate = 0.95
    self.network = QNetwork()
    self.weights = []
    self.last_sym_index = None
Example #5
import numpy as np
import numpy.random as npr

class DeepQLearner(BaseLearner):
    def __init__(self):
        super(DeepQLearner, self).__init__()
        self.discount_rate = 0.95
        self.network = QNetwork()
        self.weights = []

    def decide_action(self, new_state, possible_moves):
        if self.last_state is None:
            return npr.choice(possible_moves)

        valid_rotations = self.get_rotated_boards(new_state,possible_moves)

        # compute Q-scores with forward-propagation
        list_Qscore = []
        for some_state in valid_rotations:
            list_Qscore.append(self.network.use_model(some_state))

        # get the best action & Q-score from current state
        best_Qscore = max(list_Qscore)
        # best_move = possible_moves[list_Qscore.index(best_Qscore)]

        # update weights with back-propagation
        self.network.update_model((self.get_rotated_boards(self.last_state, [self.last_action])[0]), float(self.last_reward + self.discount_rate * best_Qscore))

        return possible_moves[self.explorer.decide_action(self.epoch, np.asarray(list_Qscore))]

    def end_epoch(self, score):
        super(DeepQLearner, self).end_epoch(score)

        #save the network weights at this epoch
        if self.epoch % 1000 == 0:
            self.weights.append(self.network.get_all_weights())

    def get_rotated_boards(self, this_state, which_rotations):
        # Rotation codes in which_rotations:
        #   0 - board unchanged
        #   1 - 180 degree turn
        #   2 - 90 degree turn clockwise (possible_move "left" corresponds to
        #       self.last_action on the original state)
        #   3 - 90 degree turn counter-clockwise
        # Each index map gives, for every cell of the rotated 4x4 board, the
        # index of the cell it is copied from on the original board.
        # Returns a list of rotated boards as tuples.
        index_maps = {
            0: list(range(16)),
            1: [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
            2: [12, 8, 4, 0, 13, 9, 5, 1, 14, 10, 6, 2, 15, 11, 7, 3],
            3: [3, 7, 11, 15, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12],
        }

        rotated_boards = []
        for move in which_rotations:
            index_map = index_maps[move]
            rotated_boards.append(tuple(this_state[index_map[i]] for i in range(16)))
        return rotated_boards
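
The index maps used by get_rotated_boards can be checked independently of the class: composing the 90-degree-clockwise map with itself should reproduce the 180-degree map. A small standalone sanity check using the same permutation lists (the helper and variable names here are illustrative only):

# new cell i takes its value from old cell index_map[i], as in get_rotated_boards
def rotate(board, index_map):
    return tuple(board[index_map[i]] for i in range(16))

CW_90 = [12, 8, 4, 0, 13, 9, 5, 1, 14, 10, 6, 2, 15, 11, 7, 3]
ROT_180 = [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

board = tuple(range(16))
assert rotate(rotate(board, CW_90), CW_90) == rotate(board, ROT_180)
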