def _prepare_training_data(self, samples):
    inputs = []
    targets_w = []
    targets_pi = []
    env = Connect4env(width=config.Width, height=config.Height)
    for sample in samples:
        inputs.append(utils.format_state(sample[0], env))
        targets_pi.append(sample[1])
        targets_w.append(sample[2])
    return np.vstack(inputs), [np.vstack(targets_w), np.vstack(targets_pi)]
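# A minimal usage sketch (hypothetical names, not part of the code above):
# each sample follows the [state, pi, w, player] layout that run_episode
# appends to self.memory, so a training call could look roughly like
#
#   batch = random.sample(self.memory, min(len(self.memory), 256))
#   x, (y_w, y_pi) = self._prepare_training_data(batch)
#   model.fit(x, [y_w, y_pi])   # y_w: game outcomes, y_pi: MCTS policy targets
#
# The batch size of 256, the use of random.sample, and the name `model`
# are illustrative assumptions only.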
def main():
    env = Connect4env()
    state = utils.format_state(env.get_state(), env)
    network = Network('test')
    v, p = network.predict(state)
    print(v, p)
    env.step(4)
    # Re-format the state after the move so the second prediction
    # actually sees the updated board.
    state = utils.format_state(env.get_state(), env)
    v, p = network.predict(state)
    print(v, p)
    network.model.summary()
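# Expected behaviour (a sketch, assuming the usual two-headed value/policy
# network with output_dim=7): v is a scalar value estimate for the player
# to move and p is a probability distribution over the 7 columns, so the
# two printed predictions should differ once the board has changed.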
def _symmetrize_steps(self, steps):
    # Connect 4 boards are symmetric around the middle vertical axis,
    # so this function 'symmetrizes' the recorded steps to get more
    # training data and thus speed up learning.
    env = Connect4env(width=config.Width, height=config.Height)
    for i in range(len(steps)):
        state = steps[i][0]
        prob = steps[i][1]
        symmetrical_state = env.get_mirror_state(state)
        symmetrical_prob = prob[::-1]
        steps.append([
            symmetrical_state, symmetrical_prob, steps[i][2], steps[i][3]
        ])
    return steps
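# Concrete example of the augmentation above (illustrative values only):
# if a stored step carries the policy target
#   pi       = [0.50, 0.20, 0.10, 0.10, 0.05, 0.05, 0.00]
# the mirrored step keeps the same W and player but pairs the horizontally
# flipped board from env.get_mirror_state() with
#   pi[::-1] = [0.00, 0.05, 0.05, 0.10, 0.10, 0.20, 0.50]
# so that column 0 of the mirror corresponds to column 6 of the original.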
def run_episode(self):
    steps = []
    env = Connect4env(width=config.Width, height=config.Height)
    mct = MCT(network=self.best_network)
    state = env.get_state()
    reward = 0
    result = 0
    while True:
        # Run MCTS simulations from the current state
        for i in range(config.MCTS_Num):
            mct.search(state=state, reward=reward, result=result, env=env)
        # Get PI (probability distribution of actions from the current state) from the MCT
        if len(steps) < 10:
            # Early in the game we are not yet confident in the move
            # probabilities, so use the highest temperature to keep exploring.
            pi = mct.get_actions_probability(state=state, env=env, temperature=1)
        else:
            pi = mct.get_actions_probability(state=state, env=env, temperature=0)
        # Add (state, PI, placeholder for W = value of the state, current player) to memory
        steps.append([state, pi, None, env.get_current_player()])
        # Choose an action based on PI
        action = np.random.choice(len(pi), p=pi)
        # Take the action
        state, reward, result = env.step(action)
        logger.debug(action + 1)
        logger.debug(env.to_str(state))
        # If the game is finished, fill in the W placeholders
        if result != 0:
            steps = self._assign_w(steps=steps, winner=result)
            steps = self._symmetrize_steps(steps=steps)
            break
    for step in steps:
        self.memory.append(step)
        logger.debug('==============================')
        logger.debug(env.to_str(step[0]))
        logger.debug('player: {}'.format(step[3]))
        logger.debug('probabilities: {}'.format(step[1]))
        logger.debug('value: {}'.format(step[2]))
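# _assign_w is not shown here; a plausible sketch, assuming `winner` encodes
# player 1 as 1, player 2 as 2 and a draw as 3 (as in the result codes used
# during network comparison), would be:
#
#   def _assign_w(self, steps, winner):
#       for step in steps:
#           if winner == 3:           # draw
#               step[2] = 0
#           elif step[3] == winner:   # this state was reached on the winner's turn
#               step[2] = 1
#           else:
#               step[2] = -1
#       return steps
#
# i.e. each stored state is labelled with the final outcome from the
# perspective of the player to move in that state.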
def compete_for_best_network(self, new_network, best_network):
    logger.info('Comparing networks...')
    mct_new = MCT(network=new_network)
    mct_best = MCT(network=best_network)
    players = [[mct_new, 0], [mct_best, 0]]
    env = Connect4env(width=config.Width, height=config.Height)
    mct_new_wins = 0
    mct_best_wins = 0
    draw_games = 0
    for i in range(config.Compete_Game_Num):
        env.reset()
        state = env.get_state()
        reward = 0
        result = 0
        step = 0
        logger.debug('{} network gets the upper hand for this game'.format(
            players[step % 2][0].network.name))
        while True:
            for _ in range(config.Test_MCTS_Num):
                players[step % 2][0].search(
                    state=state, reward=reward, result=result, env=env)
            prob = players[step % 2][0].get_actions_probability(
                state=state, env=env, temperature=0)
            action = np.random.choice(len(prob), p=prob)
            state, reward, result = env.step(col_idx=action)
            if result == 1:
                players[0][1] += 1
                break
            elif result == 2:
                players[1][1] += 1
                break
            elif result == 3:
                draw_games += 1
                break
            else:
                step += 1
        logger.debug(env.to_str())
        logger.debug(result)
        if players[0][0] == mct_new:
            mct_new_wins = players[0][1]
            mct_best_wins = players[1][1]
        else:
            mct_new_wins = players[1][1]
            mct_best_wins = players[0][1]
        logger.info(''.join(
            ('O' * mct_new_wins, 'X' * mct_best_wins, '-' * draw_games,
             '.' * (config.Compete_Game_Num - i - 1))))
        if mct_best_wins / (mct_new_wins + mct_best_wins +
                            (config.Compete_Game_Num - i - 1)) >= (
                                1 - config.Best_Network_Threshold):
            logger.info(
                'new network has no hope of winning the comparison, so stop the comparison early.'
            )
            break
        elif mct_new_wins / (mct_new_wins + mct_best_wins +
                             (config.Compete_Game_Num - i - 1)
                             ) > config.Best_Network_Threshold:
            logger.info(
                'new network has already won the comparison, so stop the comparison early.'
            )
            break
        else:
            players.reverse()
    compete_result = mct_new_wins / (mct_best_wins + mct_new_wins)
    logger.debug(
        'new network won {} games, best network won {} games, draw games: {}'
        .format(mct_new_wins, mct_best_wins, draw_games))
    logger.debug('new network winning ratio is {}'.format(compete_result))
    is_update = compete_result > config.Best_Network_Threshold
    if is_update:
        self.best_network.replace_by(new_network)
        logger.info('Updated best network!!!')
    else:
        logger.info('Discarded current network...')
    return is_update
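# A hedged sketch of how this method could sit in the surrounding training
# loop (the loop structure, helper and config names below are assumptions,
# not part of the code above):
#
#   for _ in range(config.Iteration_Num):
#       for _ in range(config.Episode_Num):
#           self.run_episode()                    # self-play with best_network
#       new_network = self._train_new_network()   # hypothetical training step
#       self.compete_for_best_network(new_network, self.best_network)
#
# The new network is only promoted when its win ratio against the current
# best network exceeds config.Best_Network_Threshold.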
if __name__ == '__main__':
    training_flag = str(
        input(
            'Would you like to train the network before you test it (answer Y or N): '
        )).upper() == 'Y'
    best_network = Network('Best')
    if training_flag:
        training = Training(best_network)
        time.sleep(10)
        training.train()
    # =========================================
    player = 1
    env = Connect4env(width=config.Width, height=config.Height)
    mct = MCT(network=best_network)
    reward = 0
    result = 0
    try:
        human_player = int(
            input(
                'Would you like to be the 1st player or the 2nd player (answer 1 or 2): '
            ))
        if human_player not in (1, 2):
            print("Sorry, I don't understand your answer. I will play with myself.")
            human_player = 3
    except ValueError:
        print("Sorry, I don't understand your answer. I will play with myself.")
        human_player = 3
    input_dim=(7, 6, 1),
    output_dim=7,
    layers_metadata=[{
        'filters': 42,
        'kernel_size': (4, 4)
    }, {
        'filters': 42,
        'kernel_size': (4, 4)
    }, {
        'filters': 42,
        'kernel_size': (4, 4)
    }],
    reg_const=0.6,
    learning_rate=0.0005,
    root_path=None)
env = Connect4env(width=7, height=6)
mct = MCT(network=network)
player = 1
try:
    human_player = int(
        input(
            'Would you like to be the 1st player or the 2nd player (answer 1 or 2): '
        ))
    if human_player not in (1, 2):
        print("Sorry, I don't understand your answer. I will play with myself then.")
        human_player = 3
except ValueError:
    print("Sorry, I don't understand your answer. I will play with myself then.")
    human_player = 3