    def _prepare_training_data(self, samples):
        inputs = []
        targets_w = []
        targets_pi = []
        env = Connect4env(width=config.Width, height=config.Height)
        for sample in samples:
            inputs.append(utils.format_state(sample[0], env))
            targets_pi.append(sample[1])
            targets_w.append(sample[2])
        return np.vstack(inputs), [np.vstack(targets_w), np.vstack(targets_pi)]
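    # Illustrative only (not part of the original code): a minimal sketch of the expected
    # `samples` layout, assuming each element is a (raw_state, pi, z) tuple collected during
    # self-play. The literal values below are hypothetical.
    #
    #   samples = [
    #       (env.get_state(),                      # raw board state
    #        [1.0 / config.Width] * config.Width,  # MCTS policy target over the columns
    #        1.0),                                 # game outcome from the mover's perspective
    #   ]
    #   inputs, targets = self._prepare_training_data(samples)
    #   # inputs  -> formatted states stacked into one batch, shape (1, ...)
    #   # targets -> [value targets of shape (1, 1), policy targets of shape (1, config.Width)],
    #   #            matching the order of the network's [value head, policy head] outputs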
def main():
    env = Connect4env()
    state = utils.format_state(env.get_state(), env)
    network = Network('test')
    v, p = network.predict(state)
    print(v, p)
    env.step(4)
    # re-format the state after the move so the second prediction sees the updated board
    state = utils.format_state(env.get_state(), env)
    v, p = network.predict(state)
    print(v, p)
    network.model.summary()
    def search(self, state, reward, result, env, is_search_root=True):
        if is_search_root:
            logger.debug('-= A =-')
            logger.debug('-= NEW =-')
            logger.debug('-= SEARCH =-')
        logger.debug('SEARCHING STATE AS PLAYER {}:'.format(env.get_current_player(state=state)))
        logger.debug(env.to_str(state))

        # if the game has reached an end state, return the reward as V (+1 or -1)
        if result > 0:
            logger.debug('..........Reached end state, return V = {}..........'.format(reward))
            return reward

        state_id = self._state2id(state)

        # if the state is not in the tree, init the node using the network predictions and add it to the tree
        if state_id not in self.tree:
            logger.debug('++++++++++Reached a new state++++++++++')
            v, p = self.network.predict(utils.format_state(state=state, env=env))
            valid_action_mask = env.get_valid_actions(state=state)
            self.A_s[state_id] = valid_action_mask
            self.P_s[state_id] = p[0] * valid_action_mask
            self.N_s[state_id] = 0
            self.tree.append(state_id)
            logger.debug('    valid action mask: {}'.format(valid_action_mask))
            logger.debug('    masked probabilities: {}'.format(self.P_s[state_id]))
            logger.debug('    return V = -{}'.format(v))
            logger.debug('added this state onto the tree')
            return -v
        # if the state is already on the tree, expand the node
        else:
            logger.debug('!!!!!!!!!!Expanding an existing state!!!!!!!!!!')
            # first, select the action that maximizes the U value
            max_u = float('-inf')
            best_action = -1
            # NOTE: no need to shuffle the actions to avoid always picking the first one when
            # all U values are zero; the Dirichlet noise at the root already takes care of that
            actions = env.get_all_next_actions()
            if is_search_root:
                epsilon = config.Dir_Epsilon
                nu = np.random.dirichlet([config.Dir_Alpha] * len(actions))
            else:
                epsilon = 0
                nu = [0] * len(actions)

            # find the action with the maximum U
            for action in actions:
                if self.A_s[state_id][action] == 1:  # the action is valid
                    state_action_id = self._state_action2id(state, action)
                    if state_action_id in self.Q_sa:
                        logger.debug('    action {} of the current state has been visited before'.format(action))
                        u = self.Q_sa[state_action_id] + self.Cpuct * (
                                (1 - epsilon) * self.P_s[state_id][action] + epsilon * nu[action]) * math.sqrt(
                                self.N_s[state_id]) / (1 + self.N_sa[state_action_id])
                        logger.debug(
                            '    U = Q(s, a) + Cpuct * ((1 - epsilon) * P(s, a) + epsilon * dir(alpha)) * sqrt(N(s)) / (1 + N(s, a)) = {} + {} * ((1 - {}) * {} + {} * {}) * sqrt({}) / (1 + {}) = {}'.format(
                                self.Q_sa[state_action_id], self.Cpuct, epsilon, self.P_s[state_id][action],
                                epsilon, nu[action], self.N_s[state_id], self.N_sa[state_action_id], u))
                    elif self.N_s[state_id] > 0:
                        logger.debug('    action {} of the current state has never been visited before'.format(action))
                        u = self.Cpuct * (
                                (1 - epsilon) * self.P_s[state_id][action] + epsilon * nu[action]) * math.sqrt(
                                self.N_s[state_id])
                        logger.debug(
                            '    U = Cpuct * ((1 - epsilon) * P(s, a) + epsilon * dir(alpha)) * sqrt(N(s)) = {} * ((1 - {}) * {} + {} * {}) * sqrt({}) = {}'.format(
                                self.Cpuct, epsilon, self.P_s[state_id][action], epsilon, nu[action],
                                self.N_s[state_id], u))
                    else:
                        logger.debug('    action {} of the current state has never been visited before'.format(action))
                        u = self.Cpuct * ((1 - epsilon) * self.P_s[state_id][action] + epsilon * nu[action])
                        logger.debug(
                            '    U = Cpuct * ((1 - epsilon) * P(s, a) + epsilon * dir(alpha)) = {} * ((1 - {}) * {} + {} * {}) = {}'.format(
                                self.Cpuct, epsilon, self.P_s[state_id][action], epsilon, nu[action], u))
                    if u > max_u:
                        max_u = u
                        best_action = action
                else:
                    logger.debug('    action {} is invalid'.format(action))

            logger.debug('    the best action is {}'.format(best_action))

            # now take the best action and continue traversing the tree by invoking the search method
            # recursively, which also updates Q(s, a) and N(s, a)
            next_state, reward, result = env.simulate(test_state=state, col_idx=best_action)
            logger.debug('//////////Traverse the state of the best action recursively//////////')
            v = self.search(state=next_state, reward=reward, result=result, env=env, is_search_root=False)
            logger.debug('//////////Traverse is done//////////')

            # update Q(s, a) and N(s, a)
            state_action_id = self._state_action2id(state, best_action)
            # if Q(s, a) already exists
            if state_action_id in self.Q_sa:
                old_N_sa = self.N_sa[state_action_id]
                old_Q_sa = self.Q_sa[state_action_id]
                # increase N(s, a)
                self.N_sa[state_action_id] += 1
                logger.debug('    increased N(s, a) from {} to {}'.format(old_N_sa, self.N_sa[state_action_id]))
                # recalculate Q(s, a) as the running mean of the backed-up values
                self.Q_sa[state_action_id] = (self.Q_sa[state_action_id] * old_N_sa + v) / self.N_sa[state_action_id]
                logger.debug('    updated Q(s, a) from {} to {}'.format(old_Q_sa, self.Q_sa[state_action_id]))
            # if Q(s, a) does not exist, the edge (s, a) has never been traversed before
            else:
                # init Q(s, a) with the value backed up from the recursive search
                # (for a new leaf this is the network's value prediction)
                self.Q_sa[state_action_id] = v
                logger.debug('    init Q(s, a) as {}'.format(v))
                self.N_sa[state_action_id] = 1
                logger.debug('    init N(s, a) as 1')

            # increase N(s) because we just expanded state s
            old_N_s = self.N_s[state_id]
            self.N_s[state_id] += 1
            logger.debug('    increased N(s) from {} to {}'.format(old_N_s, self.N_s[state_id]))

            return -v
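# Illustrative only: the three-branch PUCT score used in search(), factored into a standalone
# helper so the selection rule can be read (and tested) in isolation. This is a sketch, not part
# of the original class; the argument names are hypothetical.
import math

def puct_score(q_sa, n_sa, p_sa, n_s, cpuct, epsilon=0.0, noise=0.0):
    """U for the edge (s, a), mirroring the branches in search().

    q_sa  : mean backed-up value Q(s, a), ignored when the edge is unvisited
    n_sa  : visit count N(s, a)
    p_sa  : network prior P(s, a) for this action
    n_s   : visit count N(s) of the parent state
    noise : Dirichlet sample for this action (root only), mixed in with weight epsilon
    """
    prior = (1 - epsilon) * p_sa + epsilon * noise
    if n_sa > 0:
        # visited edge: exploit Q plus an exploration bonus that decays with N(s, a)
        return q_sa + cpuct * prior * math.sqrt(n_s) / (1 + n_sa)
    if n_s > 0:
        # unvisited edge under a visited parent: prior-weighted exploration only
        return cpuct * prior * math.sqrt(n_s)
    # freshly expanded parent (N(s) == 0): fall back to the prior so ties break on P(s, a)
    return cpuct * prior

# Example: an unvisited edge under a parent visited 16 times, with prior 0.25 and Cpuct 1.0,
# scores 1.0 * 0.25 * sqrt(16) = 1.0.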
        return self.model.predict(x=inputs)

    def fit(self, inputs, targets, epochs, batch_size, validation_split=0.0, validation_data=None):
        with self.graph.as_default():
            return self.model.fit(
                x=inputs,
                y=targets,
                epochs=epochs,
                batch_size=batch_size,
                shuffle=True,
                verbose=0,
                validation_split=validation_split,
                validation_data=validation_data,
                callbacks=[self.tensorboard, self.checkpoint])


if __name__ == '__main__':
    env = Connect4Env(width=7, height=6)
    state = utils.format_state(env.get_state(), env)
    network = Network('test')
    v, p = network.predict(state)
    print(v, p)
    env.step(4)
    # re-format the state after the move so the second prediction sees the updated board
    state = utils.format_state(env.get_state(), env)
    v, p = network.predict(state)
    print(v, p)
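# Illustrative only: how the stacked training data produced by _prepare_training_data() would be
# fed to fit(). `trainer`, `self_play_samples`, and the epoch/batch settings are hypothetical
# names and values, not taken from the original code.
#
#   inputs, targets = trainer._prepare_training_data(self_play_samples)
#   # targets == [value_targets, policy_targets], matching the model's two output heads
#   history = network.fit(inputs, targets, epochs=10, batch_size=64, validation_split=0.1)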