class QLEnv:
    """Self-play DQN environment for a dots-and-boxes board of
    nb_rows x nb_cols boxes.

    The network state is a flat vector of every edge on the board:
    all vertical edges first (row-major), then all horizontal edges.
    Each entry is 0 (free), 1 (drawn by this agent) or -1 (opponent).
    Rewards are sparse: 0 for every intermediate move, +1/0/-1 at the
    end of the game (see end_game).
    """

    def __init__(self, player, nb_rows, nb_cols, timelimit, episode):
        self.EPSILON = self._epsilon_for(episode)
        self.timelimit = timelimit
        self.nb_rows = nb_rows
        self.nb_cols = nb_cols
        self.cells = self._empty_cells()
        # nb_rows*(nb_cols+1) vertical edges + nb_cols*(nb_rows+1) horizontal.
        self.len_states = nb_rows * (nb_cols + 1) + nb_cols * (nb_rows + 1)
        self.state = np.zeros(self.len_states)
        self.player = player
        self.score = [0, 0]
        self.reward = 0
        self.prev_state = None
        # One Q-value output per edge on the board.
        self.dqn = DQN(self.len_states, self.len_states)

    @staticmethod
    def _epsilon_for(episode):
        """Linearly decayed exploration rate, clamped below at EPS_END."""
        eps = EPS_END + (EPS_START - EPS_END) * (1 - (episode / DECAY_LEN))
        return max(eps, EPS_END)

    def _empty_cells(self):
        """Fresh (nb_rows+1) x (nb_cols+1) grid of edge markers."""
        return [
            [{"v": 0, "h": 0} for _ in range(self.nb_cols + 1)]
            for _ in range(self.nb_rows + 1)
        ]

    def reset(self, player, episode):
        """Start a new episode, re-decaying epsilon and clearing the board.

        Every SAVE_EVERY episodes the model weights are checkpointed to disk.
        """
        self.episode = episode
        self.EPSILON = self._epsilon_for(episode)
        self.reward = 0
        self.state = np.zeros(self.len_states)
        self.prev_state = None
        self.player = player
        self.score = [0, 0]
        self.cells = self._empty_cells()
        if (self.episode + 1) % SAVE_EVERY == 0:
            torch.save(
                self.dqn.model.state_dict(),
                f"model_self_play_{self.nb_rows}x{self.nb_cols}_{episode+1}.pth"
            )

    def process_next_state(self, score):
        """Record the opponent-updated score and train on the last transition.

        score -- [p1_boxes, p2_boxes]; flipped so index 0 is always this agent.
        Intermediate rewards are deliberately left at their current value
        (sparse-reward scheme); only end_game assigns a nonzero reward.
        """
        if self.player == 2:
            score = score[::-1]
        self.score = score
        if self.prev_state is None:
            return
        self.dqn.memorize(self.prev_state, self.action, self.reward, self.state)
        self.dqn.train()

    def _encode_mark(self, value):
        """Map a cell owner (0 / player id) to 0 / +1 (ours) / -1 (theirs)."""
        if value == self.player:
            return 1
        if value != 0:
            return -1
        return 0

    def update_state(self, update_prev=False):
        """Re-encode self.cells into self.state from this agent's viewpoint.

        update_prev -- snapshot the current state into prev_state first
        (set when the registered action was this agent's own move).
        """
        if update_prev:
            self.prev_state = self.state.copy()
        i = 0
        # Vertical edges: nb_rows rows of nb_cols+1 edges each.
        for ri in range(self.nb_rows):
            for ci in range(self.nb_cols + 1):
                self.state[i] = self._encode_mark(self.cells[ri][ci]['v'])
                i += 1
        # Horizontal edges: nb_rows+1 rows of nb_cols edges each.
        for ri in range(self.nb_rows + 1):
            for ci in range(self.nb_cols):
                self.state[i] = self._encode_mark(self.cells[ri][ci]['h'])
                i += 1

    def register_action(self, row, column, orientation, player):
        """Record an edge ('v' or 'h') drawn by `player` and refresh the state."""
        self.cells[row][column][orientation] = player
        self.update_state(player == self.player)

    def next_action(self):
        """Epsilon-greedy edge selection.

        Returns (row, col, orientation) for the chosen edge, or None when
        no free edge remains.
        """
        free_lines = [i for i in range(len(self.state)) if self.state[i] == 0]
        if not free_lines:
            print('end')
            return None
        if np.random.random() > self.EPSILON:
            # Greedy: take the highest-valued edge that is still free.
            free = set(free_lines)  # O(1) membership for the scan below
            moves = np.argsort(self.dqn.predict(self.state))
            idx = len(moves) - 1
            while moves[idx] not in free:
                idx -= 1
            movei = int(moves[idx])
        else:
            movei = int(np.random.choice(free_lines))
        self.action = movei
        # Decode the flat edge index back to board coordinates.
        n_vertical = (self.nb_cols + 1) * self.nb_rows
        if movei < n_vertical:
            o = 'v'
            r = movei // (self.nb_cols + 1)
            c = movei % (self.nb_cols + 1)
        else:
            movei -= n_vertical
            o = 'h'
            r = movei // self.nb_cols
            c = movei % self.nb_cols
        return r, c, o

    def end_game(self, winner):
        """Terminal transition: reward +1 for a win, 0 for a draw, -1 for a loss."""
        if winner == self.player:
            self.reward = 1
        elif winner == 0:
            self.reward = 0
        else:
            self.reward = -1
        self.dqn.memorize(self.prev_state, self.action, self.reward,
                          self.state, done=True)
        self.dqn.train(terminal=True)
class DQNAgent:
    """Self-play DQN agent for a 3x3 (tic-tac-toe style) board.

    The board state is a flat 9-vector: 0 free, 1 player one, -1 player two.
    The network input is this vector plus a +1/-1 flag for which side the
    agent is playing (see get_feature).
    """

    def __init__(self, player, episode):
        self.EPSILON = self._epsilon_for(episode)
        self.n_states = 9
        # NOTE: `np.int` was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `int` is the documented replacement.
        self.state = np.zeros(self.n_states, dtype=int)
        self.player = player
        self.reward = 0
        self.prev_state = None
        # +1 input for the side-to-play feature appended by get_feature.
        self.dqn = DQN(self.n_states + 1, self.n_states)

    @staticmethod
    def _epsilon_for(episode):
        """Linearly decayed exploration rate, clamped below at EPS_END."""
        eps = EPS_END + (EPS_START - EPS_END) * (1 - (episode / DECAY_LEN))
        return max(eps, EPS_END)

    def get_feature(self, state):
        """Return the network input: state vector plus a +1 (player 1) /
        -1 (player 2) side flag."""
        return list(state) + [2 * (self.player == 1) - 1]

    def reset(self, player, episode):
        """Start a new episode; checkpoint the model every SAVE_EVERY episodes."""
        self.episode = episode
        self.EPSILON = self._epsilon_for(episode)
        self.reward = 0
        self.state = np.zeros(self.n_states, dtype=int)
        self.prev_state = None
        self.player = player
        if (episode + 1) % SAVE_EVERY == 0:
            torch.save(self.dqn.model.state_dict(),
                       f"models/self_play_{episode+1}.pth")

    def process_next_state(self):
        """Store the (prev_state, action, reward, state) transition and train."""
        if self.prev_state is None:
            return
        x = self.get_feature(self.prev_state)
        x_ = self.get_feature(self.state)
        self.dqn.memorize(x, self.action, self.reward, x_)
        self.dqn.train()

    def register_action(self, row, column, player):
        """Record a move by either side; snapshot prev_state on our own move."""
        if self.player == player:
            self.prev_state = self.state.copy()
        self.state[3 * row + column] = {1: 1, 2: -1}[player]

    def next_action(self):
        """Epsilon-greedy cell selection.

        Returns (row, col) for the chosen cell, or None when the board is
        full. If the greedy pick is an occupied cell, the network is given
        an immediate -20 penalty for that illegal top-ranked move.
        """
        free_lines = [i for i in range(len(self.state)) if self.state[i] == 0]
        if not free_lines:
            return None
        if np.random.random_sample() > self.EPSILON:
            free = set(free_lines)  # O(1) membership for the scan below
            x = self.get_feature(self.state)
            moves = np.argsort(self.dqn.predict(x))
            idx = len(moves) - 1
            reward = 0
            while moves[idx] not in free:
                reward = -20  # greedy pick was an occupied cell
                idx -= 1
            movei = moves[idx]
            if reward < 0:
                # Punish the network for ranking an illegal move highest.
                self.dqn.memorize(x, moves[-1], reward, x)
                self.dqn.train()
        else:
            movei = np.random.choice(free_lines)
        movei = int(movei)
        self.action = movei
        return movei // 3, movei % 3

    def end_game(self, winner):
        """Terminal update: add +100 for a win, 0 for a draw, -100 for a loss,
        then store the final transition and run a terminal training step."""
        if winner == self.player:
            self.reward += 100
        elif winner == 0:
            self.reward += 0
        else:
            self.reward += -100
        x = self.get_feature(self.prev_state)
        x_ = self.get_feature(self.state)
        self.dqn.memorize(x, self.action, self.reward, x_, done=True)
        self.dqn.train(terminal=True)