def run_process(args, share_model, board_max, n_rows, rank):
    # Self-play training worker: generates games with the MCTS agent and
    # trains on the accumulated (state, mcts_probs, z) samples.
    # NOTE: `agent` (an Agent_MCTS instance) is assumed to be defined at
    # module level.
    import random
    import time
    from collections import deque
    import numpy as np
    from checkerboard import Checkerboard, BoardRender

    board = Checkerboard(board_max, n_rows)
    board_render = BoardRender(board_max, render_off=True, inline_draw=True)
    board_render.clear()

    data_buffer = deque(maxlen=100000)
    Ts = []
    Trewards = []
    TQmax = []

    for episode in range(1000):
        random.seed(time.time())
        board.reset()
        board_render.clear()
        """
        start a self-play game using a MCTS player, reuse the search tree
        store the self-play data: (state, mcts_probs, z)
        """
        p1, p2 = board.players
        states, mcts_probs, current_players = [], [], []
        for step in range(10000):
            if len(data_buffer) > 32:
                loss, entropy = agent.learn(data_buffer)
                # print('loss : ', loss, ' entropy : ', entropy)
            move, move_probs = agent.get_action(board, temp=1.0, return_prob=1)
            # store the data
            states.append(board.current_state())
            mcts_probs.append(move_probs)
            current_players.append(board.current_player)
            # perform a move
            board.step(move)
            board_render.draw(board.states)
            end, winner = board.game_end()
            if end:
                # winner from the perspective of the current player of each state
                winners_z = np.zeros(len(current_players))
                if winner != -1:
                    winners_z[np.array(current_players) == winner] = 1.0
                    winners_z[np.array(current_players) != winner] = -1.0
                # reset MCTS root node
                agent.reset_player()
                if winner != -1:
                    print("Game end. Winner is player:", winner)
                else:
                    print("Game end. Tie")
                # return winner, zip(states, mcts_probs, winners_z)
                play_data = zip(states, mcts_probs, winners_z)
                ex_play_data = get_equi_data(play_data, board_max, board_max)
                data_buffer.extend(ex_play_data)
                break
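# `get_equi_data` is called above but not defined in this section.  A minimal
# sketch is given below, assuming (as in AlphaZero-style pipelines) that each
# sample is (state, mcts_probs, winner_z), `state` has shape
# (channels, height, width), and `mcts_probs` is a flat vector of length
# height * width; the board's symmetries (4 rotations x 2 reflections) are
# used to augment the data.  The exact array shapes and move-index convention
# in the original code may differ.
def get_equi_data(play_data, board_height, board_width):
    import numpy as np

    extend_data = []
    for state, mcts_prob, winner in play_data:
        prob_grid = np.asarray(mcts_prob).reshape(board_height, board_width)
        for i in range(4):
            # rotate the feature planes and the move probabilities together
            equi_state = np.array([np.rot90(plane, i) for plane in state])
            equi_prob = np.rot90(prob_grid, i)
            extend_data.append((equi_state, equi_prob.flatten(), winner))
            # horizontal flip of the rotated position
            flip_state = np.array([np.fliplr(plane) for plane in equi_state])
            flip_prob = np.fliplr(equi_prob)
            extend_data.append((flip_state, flip_prob.flatten(), winner))
    return extend_data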
def run_process(args, share_model, board_max, n_rows, rank):
    # Human-vs-agent play loop: the human enters moves as "x,y", the agent
    # answers with MCTS moves.  `agent` (an Agent_MCTS instance) is assumed
    # to be defined at module level.
    import random
    import time
    from checkerboard import Checkerboard, BoardRender

    board = Checkerboard(board_max, n_rows)
    board_render = BoardRender(board_max, render_off=False, inline_draw=True)
    board_render.clear()
    board_render.draw(board.states)

    for episode in range(1):
        random.seed(time.time())
        board.reset()
        board_render.clear()
        """
        start a self-play game using a MCTS player, reuse the search tree
        store the self-play data: (state, mcts_probs, z)
        """
        p1, p2 = board.players
        player = input('select player 1: black, 2: white ')
        if player == '1':
            play_step = 0
        else:
            play_step = 1

        for step in range(10000):
            if step % 2 == play_step:
                ss = input('input x,y (q to quit): ')
                if ss == 'q':
                    return
                pos = ss.split(',')
                move = int(pos[0]) + int(pos[1]) * board_max
                print('move ', move)
            else:
                move, move_probs = agent.get_action(board, temp=1.0, return_prob=1)
            board.step(move)
            board_render.draw(board.states)
            end, winner = board.game_end()
            if end:
                # winner from the perspective of the current player of each state
                agent.reset_player()
                if winner != -1:
                    print("Game end. Winner is player:", winner)
                else:
                    print("Game end. Tie")
                # return winner, zip(states, mcts_probs, winners_z)
                break
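# The two human-play loops in this file accept moves typed as "x,y" and map
# them to a flat index with x + y * board_max, but malformed input would raise
# an uncaught ValueError.  The helper below is a small, hypothetical addition
# (not part of the original code) that keeps prompting until the input is a
# valid coordinate pair or 'q'.
def ask_human_move(board_max):
    """Prompt until a valid 'x,y' move is entered; return None if 'q'."""
    while True:
        text = input('input x,y (q to quit): ').strip()
        if text == 'q':
            return None
        try:
            x_str, y_str = text.split(',')
            x, y = int(x_str), int(y_str)
        except ValueError:
            print('invalid input, expected x,y')
            continue
        if 0 <= x < board_max and 0 <= y < board_max:
            return x + y * board_max
        print('coordinates out of range')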
def human_process(args, share_model, rank, self_play, shared_lr_mul, shared_g_cnt, shared_q, lock):
    # Human-vs-agent play using the latest weights pulled from the shared model.
    print('human play')
    self_play = False
    board_max = args.board_max

    from agent import Agent_MCTS
    agent = Agent_MCTS(args, 5, 800, self_play, shared_lr_mul, shared_g_cnt)
    with lock:
        agent.model_update(share_model)

    from checkerboard import Checkerboard, BoardRender
    board = Checkerboard(board_max, args.n_rows)
    board_render = BoardRender(board_max, render_off=False, inline_draw=True)
    board.reset()
    board_render.clear()
    board_render.draw(board.states)

    p1, p2 = board.players
    player = input('select player 1: black, 2: white ')
    if player == '1':
        play_step = 0
    else:
        play_step = 1

    for step in range(10000):
        if step % 2 == play_step:
            ss = input('input x,y (q to quit): ')
            if ss == 'q':
                return
            pos = ss.split(',')
            move = int(pos[0]) + int(pos[1]) * board_max
            print('move ', move)
        else:
            move, move_probs = agent.get_action(board, temp=1.0, return_prob=1)
        board.step(move)
        board_render.draw(board.states)
        end, winner = board.game_end()
        if end:
            # winner from the perspective of the current player of each state
            agent.reset_player()
            if winner != -1:
                print("Game end. Winner is player:", winner)
            else:
                print("Game end. Tie")
            # return winner, zip(states, mcts_probs, winners_z)
            return
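# A sketch of how human_process might be invoked from a main script.  The
# model class name (PolicyValueNet) and its module are placeholders; the
# original repository presumably builds `args` with argparse, uses its own
# network class, and calls share_memory() so child processes can read the
# weights under the lock.
def play_against_model(args):
    import torch.multiprocessing as mp
    from model import PolicyValueNet  # hypothetical module/class name

    share_model = PolicyValueNet(args.board_max)
    share_model.share_memory()

    shared_lr_mul = mp.Value('d', 1.0)   # learning-rate multiplier
    shared_g_cnt = mp.Value('i', 0)      # global game/step counter
    shared_q = mp.Queue()
    lock = mp.Lock()

    # run interactively in the current process
    human_process(args, share_model, 0, False,
                  shared_lr_mul, shared_g_cnt, shared_q, lock)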
def run_process(args, B_share_model, W_share_model, board_max, rank):
    # Two-agent Rainbow-DQN self-play: a black agent and a white agent share
    # their networks with the parent process and alternate moves.
    # NOTE: the original signature took a single `share_model` argument while
    # the body referenced B_share_model / W_share_model; the signature here
    # assumes one shared network per colour.
    import random
    import time
    from torch import optim
    from checkerboard import Checkerboard

    env = Checkerboard(board_max, args.render)

    from agent import Agent_rainbow
    B_Agent = Agent_rainbow(args)
    W_Agent = Agent_rainbow(args)
    B_Agent.main_dqn = B_share_model
    W_Agent.main_dqn = W_share_model
    B_Agent.optimizer = optim.Adam(B_share_model.parameters(), lr=args.lr, eps=args.adam_eps)
    W_Agent.optimizer = optim.Adam(W_share_model.parameters(), lr=args.lr, eps=args.adam_eps)

    from memory import PER_Memory
    memory = PER_Memory(args)

    """
    main loop
    """
    global_count = 0
    episode = 0
    W_Agent.target_dqn_update()
    B_Agent.target_dqn_update()
    W_Agent.train()
    B_Agent.train()

    Ts = []
    Trewards = []
    TQmax = []

    while episode < args.max_episode_length:
        random.seed(time.time())
        T = 0
        turn = 0
        max_action_value = -999999999999999
        state = env.reset()
        evaluation = False
        total_reward = 0
        if episode % args.evaluation_interval == 0:
            evaluation = True
        # args.epsilon -= 0.8 / args.max_episode_length

        while T < args.max_step:
            action_value = -999999999999999
            if T % 2 == 0:
                Agent_ptr = B_Agent
                turn = env.black
            else:
                Agent_ptr = W_Agent
                turn = env.white

            if not evaluation and (random.random() <= args.epsilon or global_count < args.learn_start):
                action = env.get_random_xy_flat()
            else:
                action, action_value = Agent_ptr.get_action(state)
                max_action_value = max(max_action_value, action_value)

            next_state, reward, done, _ = env.step_flat(action, turn)
            total_reward += reward
            # `td_error` was undefined in the original; new transitions are
            # pushed with a placeholder (maximum) priority here.
            td_error = 1.0
            memory.push(td_error, [state, action, reward, next_state, done])
            state = next_state

            # replay_interval and target_update_interval should be odd so the
            # updates alternate between the black and white agents
            if not evaluation and global_count % args.replay_interval == 0 and global_count > args.learn_start:
                Agent_ptr.learn(memory)
                Agent_ptr.reset_noise()
            if not evaluation and global_count % args.target_update_interval == 0:
                Agent_ptr.target_dqn_update()

            T += 1
            global_count += 1
            if done:
                B_Agent.reset_noise()
                W_Agent.reset_noise()
                if args.render:
                    env.render()
                break

        if evaluation:
            print('episode : ', episode, ' step : ', T, ' max_action ', max_action_value,
                  'total_reward : ', total_reward)
            Ts.append(episode)
            Trewards.append([total_reward])
            TQmax.append([max_action_value])
            _plot_line(Ts, Trewards, 'rewards_' + args.name + '_' + str(rank), path='results')
            _plot_line(Ts, TQmax, 'Q_' + args.name + '_' + str(rank), path='results')

        if episode % args.save_interval == 0:
            print('save')
            B_Agent.save('B' + args.name)
            W_Agent.save('W' + args.name)

        episode += 1
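# `_plot_line` is called above but not defined in this section; Rainbow-style
# repositories often implement it with plotly.  The function below is a
# minimal matplotlib stand-in that matches the call sites
# (xs, list-of-lists ys, title, path), not the original implementation.
def _plot_line(xs, ys_population, title, path=''):
    import os
    import numpy as np
    import matplotlib
    matplotlib.use('Agg')  # render to file, no display needed
    import matplotlib.pyplot as plt

    ys = np.asarray(ys_population, dtype=np.float64)
    if path:
        os.makedirs(path, exist_ok=True)
    plt.figure()
    plt.plot(xs, ys.mean(axis=1), label=title)
    plt.xlabel('episode')
    plt.ylabel(title)
    plt.legend()
    plt.savefig(os.path.join(path, title + '.png'))
    plt.close()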
def act_process(args, share_model, rank, self_play, shared_lr_mul, shared_g_cnt, shared_q, lock):
    # Self-play actor: repeatedly syncs weights from the shared model, plays a
    # full game with the MCTS agent, and pushes the augmented game data to the
    # shared queue for the learner.
    import random
    import time
    import numpy as np

    print(rank)
    board_max = args.board_max

    from agent import Agent_MCTS
    agent = Agent_MCTS(args, 5, 100, self_play, shared_lr_mul, shared_g_cnt)

    from checkerboard import Checkerboard, BoardRender
    board = Checkerboard(board_max, args.n_rows)
    board_render = BoardRender(board_max, render_off=True, inline_draw=False)
    board_render.clear()

    Ts = []
    Tloss = []
    Tentropy = []
    try:
        for episode in range(10000):
            start_time = time.time()
            with lock:
                agent.model_update(share_model)
            random.seed(time.time())
            board.reset()
            board_render.clear()
            board_render.draw(board.states)
            """
            start a self-play game using a MCTS player, reuse the search tree
            store the self-play data: (state, mcts_probs, z)
            """
            p1, p2 = board.players
            states, mcts_probs, current_players = [], [], []
            # list_loss = []
            # list_entropy = []
            for step in range(10000):
                move, move_probs = agent.get_action(board, temp=1.0)
                # store the data
                states.append(board.current_state())
                mcts_probs.append(move_probs)
                current_players.append(board.current_player)
                # perform a move
                board.step(move)
                board_render.draw(board.states)
                end, winner = board.game_end()
                if end:
                    # time.sleep(1)
                    # winner from the perspective of the current player of each state
                    winners_z = np.zeros(len(current_players))
                    if winner != -1:
                        winners_z[np.array(current_players) == winner] = 1.0
                        winners_z[np.array(current_players) != winner] = -1.0
                    # reset MCTS root node
                    agent.reset_player()
                    if winner != -1:
                        print(rank, "Game end. Winner is player:", winner,
                              'total_step :', step, 'time:', time.time() - start_time)
                    else:
                        print(rank, "Game end. Tie",
                              'total_step :', step, 'time:', time.time() - start_time)
                    # return winner, zip(states, mcts_probs, winners_z)
                    play_data = zip(states, mcts_probs, winners_z)
                    ex_play_data = get_equi_data(play_data, board_max, board_max)
                    shared_q.put(ex_play_data)
                    break

            # # plot_data
            # if len(data_buffer) > args.batch_size and len(list_loss) != 0:
            #     Ts.append(episode)
            #     Tloss.append(list_loss)
            #     Tentropy.append(list_entropy)
            #     _plot_line(Ts, Tloss, 'loss', path='./')
            #     _plot_line(Ts, Tentropy, 'entropy', path='./')
    except Exception as e:
        print(rank, 'except end', e)
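# act_process above only produces data; the matching learner is not shown in
# this section.  The sketch below drains the shared queue into a replay
# buffer, trains an MCTS agent, and publishes the updated weights under the
# lock so the actors' model_update() calls pick them up.  The buffer size, the
# Agent_MCTS constructor arguments, and the `model_push` helper are
# assumptions, not part of the original code.
def learn_process(args, share_model, rank, self_play, shared_lr_mul, shared_g_cnt, shared_q, lock):
    import time
    from collections import deque
    from agent import Agent_MCTS

    agent = Agent_MCTS(args, 5, 100, self_play, shared_lr_mul, shared_g_cnt)
    with lock:
        agent.model_update(share_model)

    data_buffer = deque(maxlen=100000)
    while True:
        # pull every finished game currently waiting in the queue
        while not shared_q.empty():
            data_buffer.extend(shared_q.get())
        if len(data_buffer) > args.batch_size:
            loss, entropy = agent.learn(data_buffer)
            with lock:
                # hypothetical helper: copy the learner's weights back into
                # the shared model for the actor processes
                agent.model_push(share_model)
        else:
            time.sleep(1)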