import numpy as np

# Network and Player come from the project's own modules.


class AIPlayer(Player):
    """Board-game player that selects moves with a trained Network."""

    score: int
    invalid: bool
    network: Network

    def __init__(self, boardsize: int):
        super().__init__(boardsize)
        self.score = 0
        self.invalid = False
        self.network = Network(boardsize)
        self.network.network.eval()  # inference only, no training
        self.otherPlayer = None

    def get_random_valid_move(self, state: np.ndarray) -> int:
        # Fallback: pick uniformly among empty cells (encoded as 0).
        self.invalid = False
        validMoves = np.flatnonzero(state == 0)
        return np.random.choice(validMoves)

    def get_move(self, state: np.ndarray) -> int:
        if not self.invalid:
            return self.network.get_action(state)
        else:
            # The previous move was rejected; fall back to a random valid move
            # (raising an error would be an alternative).
            return self.get_random_valid_move(state)

    def scored(self, newPoints: int):
        self.score += newPoints

    def invalidMove(self):
        self.invalid = True

    def __str__(self):
        return "AI player [id: " + str(self.id) + "]"
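
# Hedged illustration (not part of the project sources): the random fallback in
# AIPlayer.get_random_valid_move relies on np.flatnonzero to list the indices of
# empty cells (encoded as 0). The toy 3x3 board below is made up for the example.
if __name__ == "__main__":
    toy_board = np.array([1, 0, 2,
                          0, 1, 0,
                          2, 2, 1])
    valid_moves = np.flatnonzero(toy_board == 0)       # indices 1, 3 and 5
    print("valid move indices:", valid_moves)
    print("random valid move:", np.random.choice(valid_moves))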
import numpy as np

# Network, Skier and PhysicalProperties come from the project's own modules.


class AISkier(Skier):
    """Skier controlled by a trained Network loaded from disk."""

    def __init__(self, filename: str):
        super().__init__()
        self.net = Network(4)  # 4-element state: x, y, vx, vy
        self.net.load_weights(filename)
        self.net.network.eval()  # inference only

    def get_action(self, state: PhysicalProperties) -> int:
        state = self.get_state(state)
        action = self.net.get_action(state)
        return self.convert_action(action)

    def get_state(self, state: PhysicalProperties):
        # Flatten the physics snapshot into the network's input vector.
        return np.array([state.position.x, state.position.y,
                         state.v[0], state.v[1]])

    def convert_action(self, action):
        # Map the network's action index {0, 1, 2} to steering {-1, 0, 1}.
        if action == 0:
            return -1
        if action == 1:
            return 0
        if action == 2:
            return 1
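
# Hedged illustration (not part of the project sources): how get_state flattens
# a physics snapshot into the 4-element network input. The SimpleNamespace
# object is a made-up stand-in for the project's PhysicalProperties, used here
# only so the snippet runs on its own.
if __name__ == "__main__":
    from types import SimpleNamespace

    fake_state = SimpleNamespace(position=SimpleNamespace(x=1.5, y=-2.0),
                                 v=np.array([0.3, -9.8]))
    print(np.array([fake_state.position.x, fake_state.position.y,
                    fake_state.v[0], fake_state.v[1]]))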
import numpy as np

# Network, ReplayMemory, Skier and PhysicalProperties come from the project's
# own modules.


class AITrainer(Skier):
    """Skier that trains a Network with experience replay and a target network."""

    def __init__(self, reward: float, reward_out: float, sample_size: int,
                 capacity: int, gamma: float, eps_min: float, eps_decay: float,
                 double_q_interval: int = 20):
        super().__init__()
        self.rewardOut = reward_out        # reward for leaving the track
        self.reward = reward               # reward for gates and finishing
        self.model_network = Network(4)    # online network being trained
        self.target_network = Network(4)   # target network for double Q-learning
        self.state = None
        self.final_state = -np.ones(4)     # sentinel terminal state
        self.action = None
        self.replayMemory = ReplayMemory(sample_size, capacity)
        self.gamma = gamma                 # discount factor
        self.eps_greedy_value = 1.0        # start fully exploratory
        self.eps_min = eps_min
        self.eps_decay = eps_decay
        self.double_q_interval = double_q_interval
        self.double_q_counter = 0

    def get_random_action(self) -> int:
        self.action = np.random.choice([0, 1, 2])
        return self.action

    def get_action(self, state: PhysicalProperties) -> int:
        # Epsilon-greedy action selection on the online network.
        self.state = self.get_state(state)
        if np.random.rand() > self.eps_greedy_value:
            self.action = self.model_network.get_action(self.state)
        else:
            self.action = self.get_random_action()
        return self.convert_action(self.action)

    def update_eps(self, iteration: int):
        # Exponential decay towards eps_min.
        self.eps_greedy_value = self.eps_min + (1 - self.eps_min) * np.exp(
            -self.eps_decay * iteration)

    def train_model_network(self):
        # Wait until the replay memory can provide a full batch.
        if self.replayMemory.size < self.replayMemory.sampleSize:
            return
        for i in range(2):
            self.model_network.update_weights(self.replayMemory.get_sample(),
                                              self.gamma, self.target_network)
            self.double_q_counter += 1
        if self.double_q_interval == 0:
            return
        if self.double_q_counter % self.double_q_interval == 0:
            self.update_target_network()

    def update_target_network(self):
        # Copy the online weights into the target network.
        self.target_network.take_weights(self.model_network)

    def end(self):
        # The run finished: store a terminal transition with the final reward.
        self.replayMemory.add_record(self.state, self.action, self.final_state,
                                     self.reward, done=True)
        self.train_model_network()

    def out(self):
        # The skier left the track: store a terminal transition with the penalty.
        self.replayMemory.add_record(self.state, self.action, self.final_state,
                                     self.rewardOut, done=True)
        self.train_model_network()

    def gate_done(self, next_state: PhysicalProperties):
        # A gate was passed: store a rewarded, non-terminal transition.
        next_state = self.get_state(next_state)
        self.replayMemory.add_record(self.state, self.action, next_state,
                                     self.reward, done=False)

    def add_record(self, next_state: PhysicalProperties, done: bool):
        # Ordinary step with zero reward.
        next_state = self.get_state(next_state)
        self.replayMemory.add_record(self.state, self.action, next_state,
                                     reward=0, done=done)

    def get_state(self, state: PhysicalProperties):
        return np.array([state.position.x, state.position.y,
                         state.v[0], state.v[1]])

    def convert_action(self, action):
        # Map the network's action index {0, 1, 2} to steering {-1, 0, 1}.
        if action == 0:
            return -1
        if action == 1:
            return 0
        if action == 2:
            return 1
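
# Hedged illustration (not part of the project sources): the exploration
# schedule implemented by update_eps above,
#     eps = eps_min + (1 - eps_min) * exp(-eps_decay * iteration),
# evaluated for a few iteration counts. The eps_min and eps_decay values below
# are made up for the example.
if __name__ == "__main__":
    eps_min, eps_decay = 0.05, 1e-3
    for iteration in (0, 500, 1000, 5000):
        eps = eps_min + (1 - eps_min) * np.exp(-eps_decay * iteration)
        print(f"iteration {iteration:5d}: eps = {eps:.3f}")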
import numpy as np

# Network, ReplayMemory and Player come from the project's own modules.


class AITrainer(Player):
    """Board-game player that trains a Network with experience replay and a target network."""

    rewardInvalidMove: float
    rewardWinning: float
    rewardLosing: float
    state: np.ndarray
    action: int
    invalid: bool
    model_network: Network
    target_network: Network
    replayMemory: ReplayMemory
    gamma: float
    fixed_batch: bool
    eps_greedy_value: float
    eps_min: float
    eps_decay: float
    double_q_interval: int
    double_q_counter: int

    def __init__(self, board_size: int, rewardInvalidMove: float,
                 rewardWinning: float, rewardLosing: float, sample_size: int,
                 capacity: int, gamma: float, eps_min: float, eps_decay: float,
                 fixed_batch: bool = False, double_q_interval: int = 0):
        super().__init__(board_size)
        self.rewardNoScore = 0
        self.rewardInvalidMove = rewardInvalidMove
        self.rewardWinning = rewardWinning
        self.rewardLosing = rewardLosing
        self.model_network = Network(board_size)    # online network being trained
        self.target_network = Network(board_size)   # target network for double Q-learning
        self.state = None
        self.final_state = np.ones(board_size ** 2) * 5   # sentinel terminal state
        self.action = None
        self.invalid = False
        self.replayMemory = ReplayMemory(sample_size, capacity)
        self.gamma = gamma
        self.fixed_batch = fixed_batch
        self.eps_greedy_value = 1.0   # start fully exploratory
        self.eps_min = eps_min
        self.eps_decay = eps_decay
        self.double_q_interval = double_q_interval
        self.double_q_counter = 0
        self.winner = False

    def get_random_valid_move(self, state: np.ndarray) -> int:
        self.invalid = False
        validMoves = np.flatnonzero(state == 0)
        self.action = np.random.choice(validMoves)
        return self.action

    def get_move(self, state: np.ndarray) -> int:
        self.state = state.copy()
        self.action = self.model_network.get_action(self.state)
        return self.action
        # if np.random.rand() > self.eps_greedy_value:
        #     if not self.invalid:
        #         self.action = self.model_network.get_action(self.state)
        #         return self.action
        #     else:
        #         return self.get_random_valid_move(state)
        # else:
        #     return self.get_random_valid_move(state)

    def update_eps(self, iteration: int):
        # Exponential decay towards eps_min.
        self.eps_greedy_value = self.eps_min + (1 - self.eps_min) * np.exp(
            -self.eps_decay * iteration)

    def invalidMove(self):
        # An invalid move ends the episode with a penalty.
        self.replayMemory.add_record(self.state, self.action, self.final_state,
                                     self.rewardInvalidMove, done=True)
        self.train_model_network()

    def train_model_network(self):
        # Wait until the replay memory can provide a full batch.
        if self.replayMemory.size < self.replayMemory.sampleSize:
            return
        for i in range(2):
            self.model_network.update_weights(self.replayMemory.get_sample(),
                                              self.gamma, self.target_network)
            self.double_q_counter += 1
        if self.double_q_interval == 0:
            return
        if self.double_q_counter % self.double_q_interval == 0:
            self.update_target_network()

    def update_target_network(self):
        # Copy the online weights into the target network.
        self.target_network.take_weights(self.model_network)

    # def get_trained_player(self, id_number: int) -> AIPlayer:
    #     trained_network = Network(self.boardsize, self.model_network.hidden,
    #                               self.model_network.only_valid_actions,
    #                               self.model_network.softmax)
    #     trained_network.take_weights(self.model_network)
    #     return AIPlayer(id_number, self.boardsize, trained_network)

    def win(self):
        self.replayMemory.add_record(self.state, self.action, self.final_state,
                                     self.rewardWinning, done=True)
        self.train_model_network()
        self.winner = True

    def lose(self):
        self.replayMemory.add_record(self.state, self.action, self.final_state,
                                     self.rewardLosing, done=True)
        self.train_model_network()
        self.winner = False

    def add_record(self, next_game_state: np.ndarray, done: bool):
        # Ordinary move with zero reward.
        self.replayMemory.add_record(self.state, self.action, next_game_state,
                                     reward=0, done=done)
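
# Hedged illustration (not part of the project sources): a generic double-DQN
# target of the kind Network.update_weights is presumably computing from the
# gamma and target_network arguments passed above. The q_model_next /
# q_target_next vectors below are made-up stand-ins for the two networks'
# Q-value outputs on one sampled transition; the actual Network internals are
# not shown in this file.
if __name__ == "__main__":
    gamma = 0.99
    reward, done = 0.0, False
    q_model_next = np.array([0.2, 0.7, 0.1])    # online net,  Q(s', .)
    q_target_next = np.array([0.3, 0.5, 0.4])   # target net, Q(s', .)
    best_action = int(np.argmax(q_model_next))  # action chosen by the online net
    target = reward + (0.0 if done else gamma * q_target_next[best_action])
    print("TD target:", target)                 # 0 + 0.99 * 0.5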