Example #1
import numpy as np
import torch

# Assumes DQN, EPS_START, EPS_END, DECAY_LEN and SAVE_EVERY are defined
# elsewhere in the module this example comes from.
class QLEnv:
    def __init__(self, player, nb_rows, nb_cols, timelimit, episode):

        # Linearly decay epsilon from EPS_START to EPS_END over DECAY_LEN episodes.
        self.EPSILON = EPS_END + (EPS_START -
                                  EPS_END) * (1 - (episode / DECAY_LEN))
        self.EPSILON = max(self.EPSILON, EPS_END)
        self.timelimit = timelimit
        self.nb_rows = nb_rows
        self.nb_cols = nb_cols
        # cells[r][c] stores who owns the vertical ('v') and horizontal ('h')
        # line starting at grid point (r, c); 0 means the line is still free.
        rows = []
        for _ in range(nb_rows + 1):
            columns = []
            for _ in range(nb_cols + 1):
                columns.append({"v": 0, "h": 0})
            rows.append(columns)
        self.cells = rows
        # One state entry per line on the board: vertical lines first, then
        # horizontal lines.
        self.len_states = nb_rows * (nb_cols + 1) + nb_cols * (nb_rows + 1)
        self.state = np.zeros(self.len_states)
        self.player = player
        self.score = [0, 0]
        self.reward = 0
        self.prev_state = None
        self.dqn = DQN(self.len_states, self.len_states)

    def reset(self, player, episode):
        self.episode = episode
        self.EPSILON = EPS_END + (EPS_START -
                                  EPS_END) * (1 - (episode / DECAY_LEN))
        self.EPSILON = max(self.EPSILON, EPS_END)
        self.reward = 0
        self.state = np.zeros(self.len_states)
        self.prev_state = None
        self.player = player
        self.score = [0, 0]
        rows = []
        for _ in range(self.nb_rows + 1):
            columns = []
            for _ in range(self.nb_cols + 1):
                columns.append({"v": 0, "h": 0})
            rows.append(columns)
        self.cells = rows
        if (self.episode + 1) % SAVE_EVERY == 0:
            # Periodically checkpoint the network weights.
            torch.save(
                self.dqn.model.state_dict(),
                f"model_self_play_{self.nb_rows}x{self.nb_cols}_{episode+1}.pth"
            )

    def process_next_state(self, score):
        # Scores arrive as (player 1, player 2); flip them so that index 0
        # always refers to this agent.
        if self.player == 2:
            score = score[::-1]
        # self.reward = score[0] - self.score[0] - score[1] + self.score[1]
        # self.reward *= 100
        self.score = score
        if self.prev_state is None:
            return
        self.dqn.memorize(self.prev_state, self.action, self.reward,
                          self.state)
        self.dqn.train()

    def update_state(self, update_prev=False):
        # Re-encode the board from this agent's perspective: own lines are 1,
        # opponent lines are -1, free lines are 0.
        if update_prev:
            self.prev_state = self.state.copy()
        i = 0
        for ri in range(self.nb_rows):
            for ci in range(self.nb_cols + 1):
                value = self.cells[ri][ci]['v']
                if value == self.player:
                    value = 1
                elif value != 0:
                    value = -1
                self.state[i] = value
                i += 1
        for ri in range(self.nb_rows + 1):
            for ci in range(self.nb_cols):
                value = self.cells[ri][ci]['h']
                if value == self.player:
                    value = 1
                elif value != 0:
                    value = -1
                self.state[i] = value
                i += 1

    def register_action(self, row, column, orientation, player):
        self.cells[row][column][orientation] = player
        # Snapshot the previous state only when this agent made the move.
        self.update_state(player == self.player)

    def next_action(self):
        free_lines = [i for i in range(len(self.state)) if self.state[i] == 0]
        if len(free_lines) == 0:
            print('end')
            return None
        if np.random.random() > self.EPSILON:
            # Greedy move: scan the Q-values from best to worst and take the
            # highest-ranked line that is still free.
            moves = np.argsort(self.dqn.predict(self.state))
            idx = len(moves) - 1
            while moves[idx] not in free_lines:
                idx -= 1
            movei = int(moves[idx])
        else:
            # Exploration: pick a random free line.
            movei = int(np.random.choice(free_lines))
        self.action = movei
        # Decode the flat line index: vertical lines occupy the first
        # (nb_cols + 1) * nb_rows entries, horizontal lines the rest.
        if movei < (self.nb_cols + 1) * self.nb_rows:
            o = 'v'
            r = movei // (self.nb_cols + 1)
            c = movei % (self.nb_cols + 1)
        else:
            movei -= (self.nb_cols + 1) * self.nb_rows
            o = 'h'
            r = movei // self.nb_cols
            c = movei % self.nb_cols
        return r, c, o

    def end_game(self, winner):
        # Terminal reward: +1 for a win, 0 for a draw, -1 for a loss.
        if winner == self.player:
            self.reward = 1
        elif winner == 0:
            self.reward = 0
        else:
            self.reward = -1
        self.dqn.memorize(self.prev_state,
                          self.action,
                          self.reward,
                          self.state,
                          done=True)
        self.dqn.train(terminal=True)
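
The flat line index used by next_action packs all vertical lines first and all horizontal lines after them. The snippet below is a minimal standalone sketch of that mapping; index_to_move and move_to_index are hypothetical helper names introduced only for illustration, and the ordering mirrors update_state and next_action above.

# Sketch of the flat line-index encoding for an nb_rows x nb_cols board:
# vertical lines fill indices [0, nb_rows * (nb_cols + 1)), horizontal lines follow.
def index_to_move(movei, nb_rows, nb_cols):
    n_vertical = (nb_cols + 1) * nb_rows
    if movei < n_vertical:
        return movei // (nb_cols + 1), movei % (nb_cols + 1), 'v'
    movei -= n_vertical
    return movei // nb_cols, movei % nb_cols, 'h'

def move_to_index(row, col, orientation, nb_rows, nb_cols):
    if orientation == 'v':
        return row * (nb_cols + 1) + col
    return (nb_cols + 1) * nb_rows + row * nb_cols + col

# Round trip over all 12 lines of a 2x2 board.
assert all(
    move_to_index(*index_to_move(i, 2, 2), 2, 2) == i
    for i in range(12)
)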
Example #2
import numpy as np
import torch

# A 3x3 tic-tac-toe agent. As in the previous example, DQN, EPS_START,
# EPS_END, DECAY_LEN and SAVE_EVERY are assumed to be defined elsewhere
# in the module.
class DQNAgent:
    def __init__(self, player, episode):

        # Linearly decay epsilon from EPS_START to EPS_END over DECAY_LEN episodes.
        self.EPSILON = EPS_END + (EPS_START -
                                  EPS_END) * (1 - (episode / DECAY_LEN))
        self.EPSILON = max(self.EPSILON, EPS_END)
        self.n_states = 9  # 3x3 board flattened to 9 cells
        self.state = np.zeros(self.n_states, dtype=int)
        self.player = player
        self.reward = 0
        self.prev_state = None
        self.dqn = DQN(self.n_states + 1, self.n_states)

    def get_feature(self, state):
        # Append +1 or -1 so the network knows which side the agent is playing.
        return list(state) + [2 * (self.player == 1) - 1]

    def reset(self, player, episode):
        self.episode = episode
        self.EPSILON = EPS_END + (EPS_START -
                                  EPS_END) * (1 - (episode / DECAY_LEN))
        self.EPSILON = max(self.EPSILON, EPS_END)
        self.reward = 0
        self.state = np.zeros(self.n_states, dtype=int)
        self.prev_state = None
        self.player = player
        if (episode + 1) % SAVE_EVERY == 0:
            torch.save(self.dqn.model.state_dict(),
                       f"models/self_play_{episode+1}.pth")

    def process_next_state(self):
        if self.prev_state is None:
            return
        x = self.get_feature(self.prev_state)
        x_ = self.get_feature(self.state)
        self.dqn.memorize(x, self.action, self.reward, x_)
        self.dqn.train()

    def register_action(self, row, column, player):
        # Snapshot the state just before this agent's own move is applied.
        if self.player == player:
            self.prev_state = self.state.copy()
        # Board encoding: player 1's marks are 1, player 2's are -1.
        self.state[3 * row + column] = {1: 1, 2: -1}[player]

    def next_action(self):
        free_lines = [i for i in range(len(self.state)) if self.state[i] == 0]
        if len(free_lines) == 0:
            return None
        if np.random.random_sample() > self.EPSILON:
            # Greedy move: scan the Q-values from best to worst and take the
            # highest-ranked cell that is still free.
            x = self.get_feature(self.state)
            moves = np.argsort(self.dqn.predict(x))
            idx = len(moves) - 1
            reward = 0
            while moves[idx] not in free_lines:
                reward = -20
                idx -= 1
            movei = moves[idx]
            if reward < 0:
                # Penalize the network for ranking an occupied cell highest.
                self.dqn.memorize(x, moves[-1], reward, x)
                self.dqn.train()
        else:
            # Exploration: pick a random free cell.
            movei = np.random.choice(free_lines)
        movei = int(movei)
        self.action = movei
        # Decode the flat cell index into (row, column) on the 3x3 board.
        r = movei // 3
        c = movei % 3
        return r, c

    def end_game(self, winner):
        # Terminal reward: +100 for a win, 0 for a draw, -100 for a loss.
        if winner == self.player:
            self.reward += 100
        elif winner == 0:
            self.reward += 0
        else:
            self.reward += -100
        x = self.get_feature(self.prev_state)
        x_ = self.get_feature(self.state)
        self.dqn.memorize(x, self.action, self.reward, x_, done=True)
        self.dqn.train(terminal=True)
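
A minimal self-play driver for the tic-tac-toe agent above could look like the sketch below. It assumes the DQN class and constants from the original module are available; NUM_EPISODES and check_winner are hypothetical names added here only to keep the example self-contained.

import numpy as np

NUM_EPISODES = 10_000  # assumed training length, not part of the original code

def check_winner(board):
    # Return 1 or 2 if that player has three in a row, otherwise 0.
    lines = [board[i, :] for i in range(3)] + [board[:, j] for j in range(3)]
    lines += [board.diagonal(), np.fliplr(board).diagonal()]
    for line in lines:
        for p in (1, 2):
            if all(v == p for v in line):
                return p
    return 0

agents = {1: DQNAgent(player=1, episode=0), 2: DQNAgent(player=2, episode=0)}
for episode in range(NUM_EPISODES):
    board = np.zeros((3, 3), dtype=int)
    for p, agent in agents.items():
        agent.reset(player=p, episode=episode)
    current = 1
    while True:
        agent = agents[current]
        agent.process_next_state()  # learn from the transition ending at the opponent's reply
        move = agent.next_action()
        if move is None:
            break  # no free cells left
        r, c = move
        board[r, c] = current
        for a in agents.values():
            a.register_action(r, c, current)
        winner = check_winner(board)
        if winner != 0 or not (board == 0).any():
            for a in agents.values():
                a.end_game(winner)
            break
        current = 3 - current  # alternate turns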