def test_simple_decode(self):
    g = game.decode_binary(0b000000000000000000000000000000000000000000110110110110110110110)
    self.assertEqual(g, [[]] * 7)
    g = game.decode_binary(0b111111111111111111111111111111111111111111000000000000000000000)
    self.assertEqual(g, [[1] * 6] * 7)
    g = game.decode_binary(0)
    self.assertEqual(g, [[0] * 6] * 7)
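The two 63-bit literals are easier to read once the encoding is spelled out. Below is a minimal sketch of where they come from, assuming the layout the accompanying game module uses (and which these tests are consistent with): 7 columns x 6 rows of cell bits, followed by one 3-bit "free cells in this column" counter per column. GAME_COLS and GAME_ROWS mirror that module's constants.

# Sketch only: reconstructs the empty-board constant from the assumed layout.
GAME_COLS, GAME_ROWS = 7, 6

def empty_board_int():
    # 42 zero cell bits, then 0b110 (= 6 free cells) for each of the 7 columns
    bits = '0' * (GAME_COLS * GAME_ROWS) + format(GAME_ROWS, '03b') * GAME_COLS
    return int(bits, 2)

assert empty_board_int() == 0b000000000000000000000000000000000000000000110110110110110110110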
def search_minibatch(self, count, state_int, player, net, device="cpu"):
    """
    Perform several MCTS searches.
    """
    backup_queue = []
    expand_states = []
    expand_players = []
    expand_queue = []
    planned = set()
    for _ in range(count):
        value, leaf_state, leaf_player, states, actions = \
            self.find_leaf(state_int, player)
        if value is not None:
            backup_queue.append((value, states, actions))
        else:
            if leaf_state not in planned:
                planned.add(leaf_state)
                leaf_state_lists = game.decode_binary(leaf_state)
                expand_states.append(leaf_state_lists)
                expand_players.append(leaf_player)
                expand_queue.append((leaf_state, states, actions))

    # do expansion of nodes
    if expand_queue:
        batch_v = model.state_lists_to_batch(expand_states, expand_players, device)
        logits_v, values_v = net(batch_v)
        probs_v = F.softmax(logits_v, dim=1)
        values = values_v.data.cpu().numpy()[:, 0]
        probs = probs_v.data.cpu().numpy()

        # create the nodes
        for (leaf_state, states, actions), value, prob in \
                zip(expand_queue, values, probs):
            self.visit_count[leaf_state] = [0] * game.GAME_COLS
            self.value[leaf_state] = [0.0] * game.GAME_COLS
            self.value_avg[leaf_state] = [0.0] * game.GAME_COLS
            self.probs[leaf_state] = prob
            backup_queue.append((value, states, actions))

    # perform backup of the searches
    for value, states, actions in backup_queue:
        # the leaf state is not stored in states and actions,
        # so the value of the leaf will be the value of the opponent
        cur_value = -value
        for state_int, action in zip(states[::-1], actions[::-1]):
            self.visit_count[state_int][action] += 1
            self.value[state_int][action] += cur_value
            self.value_avg[state_int][action] = \
                self.value[state_int][action] / self.visit_count[state_int][action]
            cur_value = -cur_value
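The sign flip in the backup loop is the easiest part to get wrong: the leaf value is from the perspective of the player to move at the leaf, so each step back toward the root negates it before crediting the move taken at that state. A standalone toy version (hypothetical names, not this class's API):

# Toy backup over a (state, action) path; values alternate sign per ply.
def backup(value, visit_count, total_value, path):
    cur_value = -value  # the parent of the leaf sees the opposite outcome
    for state, action in reversed(path):
        visit_count[(state, action)] = visit_count.get((state, action), 0) + 1
        total_value[(state, action)] = total_value.get((state, action), 0.0) + cur_value
        cur_value = -cur_value

visits, totals = {}, {}
backup(1.0, visits, totals, [("root", 3), ("child", 5)])
assert totals[("child", 5)] == -1.0 and totals[("root", 3)] == 1.0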
def test_move_horizontal_win(self):
    f = game.encode_lists([[]] * 7)
    f, won = game.move(f, 0, 1)
    self.assertFalse(won)
    l = game.decode_binary(f)
    self.assertEqual(l, [[1]] + [[]] * 6)
    f, won = game.move(f, 1, 1)
    self.assertFalse(won)
    l = game.decode_binary(f)
    self.assertEqual(l, [[1], [1]] + [[]] * 5)
    f, won = game.move(f, 3, 1)
    self.assertFalse(won)
    l = game.decode_binary(f)
    self.assertEqual(l, [[1], [1], [], [1], [], [], []])
    f, won = game.move(f, 2, 1)
    self.assertTrue(won)
    l = game.decode_binary(f)
    self.assertEqual(l, [[1], [1], [1], [1], [], [], []])
def test_move_vertical_win(self):
    f = game.encode_lists([[]] * 7)
    f, won = game.move(f, 0, 1)
    self.assertFalse(won)
    l = game.decode_binary(f)
    self.assertEqual(l, [[1]] + [[]] * 6)
    f, won = game.move(f, 0, 1)
    self.assertFalse(won)
    l = game.decode_binary(f)
    self.assertEqual(l, [[1, 1]] + [[]] * 6)
    f, won = game.move(f, 0, 1)
    self.assertFalse(won)
    l = game.decode_binary(f)
    self.assertEqual(l, [[1, 1, 1]] + [[]] * 6)
    f, won = game.move(f, 0, 1)
    self.assertTrue(won)
    l = game.decode_binary(f)
    self.assertEqual(l, [[1, 1, 1, 1]] + [[]] * 6)
def search_minibatch(self, count, state_int, player, net, step, device="cpu"):
    backup_queue = []
    expand_states = []
    expand_steps = []
    expand_queue = []
    planned = set()
    for _ in range(count):
        value, leaf_state, leaf_step, states, actions = self.find_leaf(
            state_int, player, step)
        if value is not None:
            backup_queue.append((value, states, actions))
        else:
            if leaf_state not in planned:
                planned.add(leaf_state)
                leaf_state_lists = game.decode_binary(leaf_state)
                expand_states.append(leaf_state_lists)
                expand_steps.append(leaf_step)
                expand_queue.append((leaf_state, states, actions))

    # expand the collected leaves in one network batch; this variant feeds the
    # step counter (instead of the player) to the model and sizes the per-node
    # statistics by the global action table
    if expand_queue:
        batch_v = model.state_lists_to_batch(expand_states, expand_steps, device)
        logits_v, values_v = net(batch_v)
        probs_v = F.softmax(logits_v, dim=1)
        values = values_v.data.cpu().numpy()[:, 0]
        probs = probs_v.data.cpu().numpy()

        for (leaf_state, states, actions), value, prob in zip(expand_queue, values, probs):
            self.visit_count[leaf_state] = [0] * actionTable.AllMoveLength
            self.value[leaf_state] = [0.0] * actionTable.AllMoveLength
            self.value_avg[leaf_state] = [0.0] * actionTable.AllMoveLength
            self.probs[leaf_state] = prob
            backup_queue.append((value, states, actions))

    # back up the values, flipping the sign at each ply
    for value, states, actions in backup_queue:
        cur_value = -value
        for state_int, action in zip(states[::-1], actions[::-1]):
            self.visit_count[state_int][action] += 1
            self.value[state_int][action] += cur_value
            self.value_avg[state_int][action] = \
                self.value[state_int][action] / self.visit_count[state_int][action]
            cur_value = -cur_value
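Because this variant keeps visit counts over the full actionTable.AllMoveLength action space, the resulting policy has to be restricted to the legal moves of the current position before sampling. A hypothetical sketch of that step (not this repo's get_policy_value; names and signature are invented for illustration):

import numpy as np

def policy_from_visits(visit_counts, legal_indices, tau=1.0):
    # zero out illegal entries, then renormalize with temperature tau
    counts = np.zeros(len(visit_counts))
    counts[legal_indices] = np.asarray(visit_counts, dtype=np.float64)[legal_indices]
    if counts.sum() == 0:
        counts[legal_indices] = 1.0  # unvisited node: uniform over legal moves
    probs = counts ** (1.0 / tau)
    return probs / probs.sum()

probs = policy_from_visits([0, 3, 1, 0], legal_indices=[1, 2])
# -> [0.0, 0.75, 0.25, 0.0]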
def render(pan_str, player_human):
    # Draw the board from the human player's point of view.
    pan = game.decode_binary(pan_str)
    print("  1 2 3 4 5 6 7 8 9")
    for y in range(10):
        s = chr((10 - y if y > 0 else 0) + ord('0')) + " "
        for x in range(9):
            # mirror the coordinates when the human plays Cho (player_human == 0)
            a = pan[y if player_human > 0 else 9 - y][x if player_human > 0 else 8 - x]
            s += piece_str[a // 10 * 7 + a % 10 - 1] + ('-' if x < 8 else ' ') if a > 0 else \
                (('┌' if x < 1 else '┬' if x < 8 else '┐') if y < 1 else
                 ('├' if x < 1 else '╋' if x < 8 else '┤') if y < 9 else
                 ('└' if x < 1 else '┴' if x < 8 else '┘')) + ('─-' if x < 8 else ' ')
        print(s)
        if y < 9:
            # diagonal palace lines sit between rows 0-2 and 7-9
            print("  │ │ │ │\\│/│ │ │ │ " if y < 1 or y == 7 else
                  "  │ │ │ │/│\\│ │ │ │ " if y == 1 or y > 7 else
                  "  │ │ │ │ │ │ │ │ │ ")
print("Step %d, steps %3d, leaves %4d, steps/s %5.2f, leaves/s %6.2f, best_idx %d, replay %d" % ( step_idx, game_steps, game_nodes, speed_steps, speed_nodes, best_idx, len(replay_buffer))) step_idx += 1 if len(replay_buffer) < MIN_REPLAY_TO_TRAIN: continue # train sum_loss = 0.0 sum_value_loss = 0.0 sum_policy_loss = 0.0 for _ in range(TRAIN_ROUNDS): batch = random.sample(replay_buffer, BATCH_SIZE) batch_states, batch_who_moves, batch_probs, batch_values = zip(*batch) batch_states_lists = [game.decode_binary(state) for state in batch_states] states_v = model.state_lists_to_batch(batch_states_lists, batch_who_moves, device) optimizer.zero_grad() probs_v = torch.FloatTensor(batch_probs).to(device) values_v = torch.FloatTensor(batch_values).to(device) out_logits_v, out_values_v = net(states_v) loss_value_v = F.mse_loss(out_values_v.squeeze(-1), values_v) loss_policy_v = -F.log_softmax(out_logits_v, dim=1) * probs_v loss_policy_v = loss_policy_v.sum(dim=1).mean() loss_v = loss_policy_v + loss_value_v loss_v.backward() optimizer.step() sum_loss += loss_v.item()
def play_game(net1, steps_before_tau_0, mcts_batch_size, device="cpu"):
    assert isinstance(net1, model.Net)
    assert isinstance(steps_before_tau_0, int) and steps_before_tau_0 >= 0
    assert isinstance(mcts_batch_size, int) and mcts_batch_size > 0

    global mcts_searches
    pan = game.encode_lists([list(i) for i in game.INITIAL_STATE], 0)
    historystr = []
    cur_player = 0
    step = 0
    mctsi = mcts.MCTS()
    result = None
    exitf = False
    a0 = ord('1')

    while True:
        s = input('Select the side you want to play 0) Cho, 1) Han ? ')
        if s.find('level') >= 0:
            mcts_searches = LEVELC * int(s[6:])
            print('OK', flush=True)
        else:
            player_human = 0 if int(s) < 1 else 1
            break

    while result is None:
        movelist = game.possible_moves(pan, cur_player, step)
        # repetition guard: if the position from four plies ago has recurred,
        # remove the move that would recreate it yet again
        if step > 9 and historystr[-4][:90] == historystr[-8][:90]:
            p = game.decode_binary(pan)
            for idx, m in enumerate(movelist):
                spos = m // 100
                tpos = m % 100
                y0, x0 = spos // 9, spos % 9
                y1, x1 = tpos // 9, tpos % 9
                # make the move on a scratch board, compare, then undo it
                captured = p[y1][x1]
                p[y1][x1] = p[y0][x0]
                p[y0][x0] = 0
                ps = game.encode_lists(p, step + 1)
                if ps[:90] == historystr[-4][:90]:
                    del movelist[idx]
                    break
                p[y0][x0] = p[y1][x1]
                p[y1][x1] = captured
        if (step < 2 and cur_player != player_human) or (step > 1 and cur_player == player_human):
            # human turn (steps 0-1 are the Ma-Sang setup choices; the role
            # check is inverted there because each side picks its own setup)
            if step < 2:
                print("Choose the Ma-Sang (horse-elephant) setup 0) " + masang[0] +
                      ", 1) " + masang[1] + ", 2) " + masang[2] + ", 3) " + masang[3])
            else:
                render(pan, player_human)
                if step == 2 or step == 3:
                    print("")
                    print("Enter the piece's row and column followed by the destination "
                          "row and column, e.g. 0010; to pass, enter 0")
            action = -1
            while action < 0:
                s = input((str(step - 1) if step > 1 else '') + ' ? ')
                if s == "new":
                    exitf = True
                    break
                elif s.find('level') >= 0:
                    mcts_searches = LEVELC * int(s[6:])
                    print('OK', flush=True)
                elif step < 2:
                    if len(s) == 1 and s[0] >= '0' and s[0] < '4':
                        action = int(s) + 10000
                    elif len(s) == 1:
                        action = 0
                elif s == 'undo' and step > 3:
                    step -= 2
                    historystr.pop()
                    historystr.pop()
                    pan = historystr[-1]
                    movelist = game.possible_moves(pan, cur_player, step)
                    render(pan, player_human)
                elif len(s) == 4 and '0' <= s[0] <= '9' and '0' < s[1] <= '9' \
                        and '0' <= s[2] <= '9' and '0' < s[3] <= '9':
                    # translate the human's coordinates into the engine's orientation
                    b1 = 9 - ord(s[0]) + a0 if s[0] > '0' else 0
                    if player_human < 1:
                        b1 = 9 - b1
                    b2 = ord(s[1]) - a0
                    if player_human < 1:
                        b2 = 8 - b2
                    b3 = 9 - ord(s[2]) + a0 if s[2] > '0' else 0
                    if player_human < 1:
                        b3 = 9 - b3
                    b4 = ord(s[3]) - a0
                    if player_human < 1:
                        b4 = 8 - b4
                    action = (b1 * 9 + b2) * 100 + b3 * 9 + b4
                    if action not in movelist:
                        action = -1
                else:
                    print('OK', flush=True)
        else:
            # network turn: run batched MCTS, then sample (tau=1) or take the argmax
            mctsi.search_batch(mcts_searches, mcts_batch_size, pan,
                               cur_player, net1, step, device=device)
            probs, values = mctsi.get_policy_value(pan, movelist, cur_player)
            chList = actionTable.choList if cur_player < 1 else actionTable.hanList
            n = np.random.choice(actionTable.AllMoveLength, p=probs) \
                if step < steps_before_tau_0 else np.argmax(probs)
            action = chList[n]
            """for m in movelist:
                print('%04d %.2f' % (m, probs[chList.index(m)]), end=', ')
            print()"""
            if step < 2:
                print(('Han: ' if step < 1 else 'Cho: ') + masang[action - 10000] +
                      ' ' + str(values[n]), flush=True)
                if step == 1:
                    render(pan, player_human)
            else:
                if action < 1:
                    print('Pass ' + str(values[n]))
                else:
                    b1 = action // 100 // 9
                    if player_human < 1:
                        b1 = 9 - b1
                    b2 = action // 100 % 9
                    if player_human < 1:
                        b2 = 8 - b2
                    b3 = action % 100 // 9
                    if player_human < 1:
                        b3 = 9 - b3
                    b4 = action % 100 % 9
                    if player_human < 1:
                        b4 = 8 - b4
                    print((chr(9 - b1 + a0) if b1 > 0 else '0') + chr(b2 + a0) +
                          (chr(9 - b3 + a0) if b3 > 0 else '0') + chr(b4 + a0) +
                          ' ' + str(values[n]))
        if exitf:
            break
        pan, won = game.move(pan, action, step)
        historystr.append(pan)
        if won > 0:
            render(pan, player_human)
            print(('Cho' if won == 1 else 'Han') + ' wins')
            break
        cur_player = 1 - cur_player
        step += 1
step_idx += 1
if len(replay_buffer) < MIN_REPLAY_TO_TRAIN:
    continue

# train
sum_loss = 0.0
sum_value_loss = 0.0
sum_policy_loss = 0.0
for _ in range(TRAIN_ROUNDS):
    batch = random.sample(replay_buffer, BATCH_SIZE)
    batch_states, batch_who_moves, batch_probs, batch_values = zip(*batch)
    batch_states_lists = [game.decode_binary(state) for state in batch_states]
    states_v = model.state_lists_to_batch(batch_states_lists, batch_who_moves, args.cuda)

    optimizer.zero_grad()
    probs_v = Variable(torch.FloatTensor(batch_probs))
    values_v = Variable(torch.FloatTensor(batch_values))
    if args.cuda:
        probs_v = probs_v.cuda()
        values_v = values_v.cuda()
    out_logits_v, out_values_v = net(states_v)

    # squeeze the value head to match values_v's shape and avoid broadcasting
    loss_value_v = F.mse_loss(out_values_v.squeeze(-1), values_v)
    loss_policy_v = -F.log_softmax(out_logits_v, dim=1) * probs_v
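This older variant of the training loop still wraps tensors in Variable, which has been a no-op alias for Tensor since PyTorch 0.4. A sketch of the equivalent setup in current PyTorch, replacing the args.cuda flag with an explicit device (batch_probs and batch_values are the loop's locals from above):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
probs_v = torch.tensor(batch_probs, dtype=torch.float32, device=device)
values_v = torch.tensor(batch_values, dtype=torch.float32, device=device)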