    def _prepare_training_data(self, samples):
        # Each sample is expected to be (raw state, policy target pi, value target w).
        inputs = []
        targets_w = []
        targets_pi = []
        env = Connect4env(width=config.Width, height=config.Height)
        for sample in samples:
            inputs.append(utils.format_state(sample[0], env))
            targets_pi.append(sample[1])
            targets_w.append(sample[2])
        # Stack into arrays: network inputs and [value targets, policy targets] for the two output heads.
        return np.vstack(inputs), [np.vstack(targets_w), np.vstack(targets_pi)]
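A hedged usage sketch of the sample layout this method expects; the (state, pi, w) ordering is inferred from the indexing above, and the uniform policy and the `agent` name are illustrative assumptions, not part of the original code:

# Hypothetical self-play sample: (raw board state, policy target pi, value target w).
env = Connect4env(width=config.Width, height=config.Height)
uniform_pi = np.full(config.Width, 1.0 / config.Width)  # one entry per column
samples = [(env.get_state(), uniform_pi, 1.0)]
inputs, (values, policies) = agent._prepare_training_data(samples)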
Example #2
def main():
    env = Connect4env()
    state = utils.format_state(env.get_state(), env)
    network = Network('test')
    v, p = network.predict(state)
    print(v, p)
    env.step(4)
    # Re-format the state after the move so the second prediction reflects the new board.
    state = utils.format_state(env.get_state(), env)
    v, p = network.predict(state)
    print(v, p)
    network.model.summary()
Example #3
    def search(self, state, reward, result, env, is_search_root=True):
        if is_search_root:
            logger.debug('-= A =-')
            logger.debug('-= NEW =-')
            logger.debug('-= SEARCH =-')
        logger.debug('SEARCHING STATE AS PLAYER {}:'.format(env.get_current_player(state=state)))
        logger.debug(env.to_str(state))
        # if the game has reached the end state, return the reward as V, +1 or -1
        if result > 0:
            logger.debug('..........Reached end state return V = {}..........'.format(reward))
            return reward

        state_id = self._state2id(state)

        # if the state is not in the tree, init the node using network predictions and add it on the tree
        if state_id not in self.tree:
            logger.debug('++++++++++Reached a new state++++++++++')
            v, p = self.network.predict(utils.format_state(state=state, env=env))
            valid_action_mask = env.get_valid_actions(state=state)
            self.A_s[state_id] = valid_action_mask
            self.P_s[state_id] = p[0] * valid_action_mask
            self.N_s[state_id] = 0
            self.tree.append(state_id)
            logger.debug('  valid action mask: {}'.format(valid_action_mask))
            logger.debug('  masked probabilities: {}'.format(self.P_s[state_id]))
            logger.debug('  return V = {}'.format(-v))
            logger.debug('added this state onto the tree')
            return -v

        # if the state is already on the tree, expand the node
        else:
            logger.debug('!!!!!!!!!!Expanding an existing state!!!!!!!!!!')
            # first, select the best action that maximizes U value
            max_u = float('-inf')
            best_action = -1
            # TODO: may need to shuffle the actions so the first one is not always chosen when all U values are zero
            # NOTE: the Dirichlet noise added at the root already takes care of that
            actions = env.get_all_next_actions()
            if is_search_root:
                epsilon = config.Dir_Epsilon
                nu = np.random.dirichlet([config.Dir_Alpha] * len(actions))
            else:
                epsilon = 0
                nu = [0] * len(actions)
            # Find the action with the maximum u
            for action in actions:
                if self.A_s[state_id][action] == 1: # if the action is valid
                    state_action_id = self._state_action2id(state, action)
                    if state_action_id in self.Q_sa:
                        logger.debug('  action {} of the current state has been visited before'.format(action))
                        u = self.Q_sa[state_action_id] + self.Cpuct * (
                            (1 - epsilon) * self.P_s[state_id][action] + epsilon * nu[action]) * math.sqrt(
                            self.N_s[state_id]) / (1 + self.N_sa[state_action_id])
                        logger.debug(
                            '   U = Q(s, a) + Cpuct * ((1 - epsilon) * P(s, a) + epsilon * dir(alpha)) * sqrt(N(s)) / (1 + N(s, a)) = {} + {} * ((1 - {}) * {} + {} * {}) * sqrt({}) / (1 + {}) = {}'.format(
                                self.Q_sa[state_action_id], self.Cpuct, epsilon, self.P_s[state_id][action], epsilon,
                                nu[action], self.N_s[state_id], self.N_sa[state_action_id], u
                            ))
                    elif self.N_s[state_id] > 0:
                        logger.debug('  action {} of the current state has never been visited before'.format(action))
                        u = self.Cpuct * (
                            (1 - epsilon) * self.P_s[state_id][action] + epsilon * nu[action]) * math.sqrt(self.N_s[state_id])
                        logger.debug(
                            '   U = Cpuct * ((1 - epsilon) * P(s, a) + epsilon * dir(alpha)) * sqrt(N(s)) = {} * ((1 - {}) * {} + {} * {}) * sqrt({}) = {}'.format(
                                self.Cpuct, epsilon, self.P_s[state_id][action], epsilon, nu[action], self.N_s[state_id], u
                            )
                        )
                    else:
                        logger.debug('  action {} of the current state has never been visited before'.format(action))
                        u = self.Cpuct * ((1 - epsilon) * self.P_s[state_id][action] + epsilon * nu[action])
                        logger.debug(
                            '   U = Cpuct * ((1 - epsilon) * P(s, a) + epsilon * dir(alpha)) = {} * ((1 - {}) * {} + {} * {}) = {}'.format(
                                self.Cpuct, epsilon, self.P_s[state_id][action], epsilon, nu[action], u
                            ))
                    if u > max_u:
                        max_u = u
                        best_action = action
                else:
                    logger.debug('  action {} is invalid'.format(action))
            logger.debug('  the best action is {}'.format(best_action))

            # Now take the best action and continue traversing the tree by invoking the search method recursively,
            # which also updates the Q(s, a) and N(s, a)
            next_state, reward, result = env.simulate(test_state=state, col_idx=best_action)
            logger.debug('//////////Traverse the state of the best action recursively//////////')
            v = self.search(state=next_state, reward=reward, result=result, env=env, is_search_root=False)
            logger.debug('//////////Traverse is done//////////')

            # Update Q(s, a) and N(s, a)
            state_action_id = self._state_action2id(state, best_action)
            # if Q(s, a) already exists
            if state_action_id in self.Q_sa:
                old_N_sa = self.N_sa[state_action_id]
                old_Q_sa = self.Q_sa[state_action_id]
                # increase N_sa
                self.N_sa[state_action_id] += 1
                logger.debug('  increased N(s, a) from {} to {}'.format(old_N_sa, self.N_sa[state_action_id]))
                # recalculate the mean of V as Q(s, a)
                self.Q_sa[state_action_id] = (self.Q_sa[state_action_id] * old_N_sa + v) / self.N_sa[state_action_id]
                logger.debug('  updated Q(s, a) from {} to {}'.format(old_Q_sa, self.Q_sa[state_action_id]))
            # if Q(s, a) does not exist, meaning the 'next state' generated by (s, a) is not on the tree
            else:
                # init Q(s, a) with the value propagated back from the new leaf (the network's prediction)
                self.Q_sa[state_action_id] = v
                logger.debug('  init Q(s, a) as {}'.format(v))
                self.N_sa[state_action_id] = 1
                logger.debug('  init N(s, a) as 1')
            # increase N(s) because we just expanded state s
            old_N_s = self.N_s[state_id]
            self.N_s[state_id] += 1
            logger.debug('  increased N(s) from {} to {}'.format(old_N_s, self.N_s[state_id]))
            return -v
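For reference, the U value computed in the selection loop above follows the standard PUCT rule; a minimal standalone sketch of the same arithmetic (the `puct_score` helper name is an illustrative assumption, not part of the original code):

import math

def puct_score(q_sa, p_sa, n_s, n_sa, cpuct, epsilon=0.0, noise=0.0):
    # U = Q(s, a) + Cpuct * ((1 - eps) * P(s, a) + eps * noise) * sqrt(N(s)) / (1 + N(s, a))
    prior = (1 - epsilon) * p_sa + epsilon * noise
    return q_sa + cpuct * prior * math.sqrt(n_s) / (1 + n_sa)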
    def predict(self, inputs):
        with self.graph.as_default():
            return self.model.predict(x=inputs)

    def fit(self,
            inputs,
            targets,
            epochs,
            batch_size,
            validation_split=0.0,
            validation_data=None):
        with self.graph.as_default():
            return self.model.fit(
                x=inputs,
                y=targets,
                epochs=epochs,
                batch_size=batch_size,
                shuffle=True,
                verbose=0,
                validation_split=validation_split,
                validation_data=validation_data,
                callbacks=[self.tensorboard, self.checkpoint])
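A hedged sketch of how `_prepare_training_data` and `fit` might be wired together; the `agent`, `network`, and `samples` names and the hyperparameter values are assumptions for illustration, not taken from the snippets:

# Hypothetical training step: stack self-play samples and fit the two-headed model.
inputs, targets = agent._prepare_training_data(samples)  # targets = [values, policies]
history = network.fit(inputs=inputs, targets=targets,
                      epochs=10, batch_size=64, validation_split=0.1)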


if __name__ == '__main__':
    env = Connect4Env(width=7, height=6)
    state = utils.format_state(env.get_state(), env)
    network = Network('test')
    v, p = network.predict(state)
    print(v, p)
    env.step(4)
    # Re-format the state after the move so the second prediction reflects the new board.
    state = utils.format_state(env.get_state(), env)
    v, p = network.predict(state)
    print(v, p)