def _set_call_matrix(self, board):
    street = card_tools.board_to_street(board)
    self.equity_matrix = arguments.Tensor(game_settings.card_count,
                                          game_settings.card_count).zero_()
    if street == 0:
        # iterate through all possible next-round boards
        next_round_boards = card_tools.get_second_round_boards()
        boards_count = next_round_boards.size(0)
        next_round_equity_matrix = arguments.Tensor(game_settings.card_count,
                                                    game_settings.card_count)
        for board_idx in range(boards_count):
            self.get_last_round_call_matrix(next_round_boards[board_idx],
                                            next_round_equity_matrix)
            self.equity_matrix.add_(next_round_equity_matrix)
        # average the values in the call matrix over the possible boards
        weight_constant = 1 / (game_settings.card_count - 2) \
            if game_settings.board_card_count == 1 \
            else 2 / ((game_settings.card_count - 2) *
                      (game_settings.card_count - 3))
        self.equity_matrix.mul_(weight_constant)
    elif street == 1:
        # for the last round we just return the matrix
        self.get_last_round_call_matrix(board, self.equity_matrix)
    else:
        assert False, 'impossible street'
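# A standalone check of the averaging weights above. With both private cards
# dealt, a 1-card second-round board can be any of the remaining card_count - 2
# cards, and a 2-card board any of C(card_count - 2, 2) pairs; the constants
# are just the reciprocals of those counts (card_count = 6 is an assumed
# Leduc-style deck size, not read from game_settings):
def _check_equity_weights(card_count=6):
    one_card_boards = card_count - 2
    two_card_boards = (card_count - 2) * (card_count - 3) // 2
    assert 1 / (card_count - 2) == 1 / one_card_boards
    assert 2 / ((card_count - 2) * (card_count - 3)) == 1 / two_card_boards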
def _get_terminal_value(self, state, player):
    # the opponent's action led to this terminal node
    node = state.node
    assert node.terminal
    value = arguments.Tensor([0, 0])
    if node.type == constants.node_types.terminal_fold:
        # terminal fold: the whole pot goes to one player
        value[node.current_player] = node.bets.sum()
    elif node.type == constants.node_types.terminal_call:
        # showdown: award the pot according to hand strength
        player_hand = arguments.Tensor(
            state.private[node.current_player].tolist() + node.board.tolist())
        player_strength = evaluator.evaluate(player_hand, -1)
        oppo_hand = arguments.Tensor(
            state.private[1 - node.current_player].tolist() + node.board.tolist())
        oppo_strength = evaluator.evaluate(oppo_hand, -1)
        if player_strength < oppo_strength:
            value[node.current_player] = node.bets.sum()
        elif player_strength > oppo_strength:
            value[1 - node.current_player] = node.bets.sum()
        else:
            # tie: each player takes back their own bet
            value = node.bets.clone()
    else:
        assert False, 'not a valid terminal node'
    return value[player]
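# A self-contained restatement of the payoff rule above in plain Python, useful
# for unit-testing it in isolation. The strength-comparison direction mirrors
# the function above and is an assumption about the evaluator's convention:
def _terminal_payoff(bets, current_player, folded,
                     player_strength=None, oppo_strength=None):
    value = [0, 0]
    pot = sum(bets)
    if folded:
        value[current_player] = pot
    elif player_strength < oppo_strength:
        value[current_player] = pot
    elif player_strength > oppo_strength:
        value[1 - current_player] = pot
    else:
        value = list(bets)  # tie: each player takes back their own bet
    return value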
def dfs_fill_table(node, table, dqnmodel, builder):
    if node.terminal:
        return
    # sl: normalize every row of this node's slice of the strategy table
    all_table = table[node.node_id, :, :]
    for i in range(all_table.size(0)):
        all_table[i, :] = all_table[i, :] / all_table[i, :].sum()
    node.table = all_table
    # rl: query the DQN once per possible private card and stack the outputs
    node.rl = arguments.Tensor([])  # start empty; one row is appended per card
    for i in range(game_settings.card_count):
        state = GameState()
        state.node = node
        state.private = [arguments.Tensor([i]), arguments.Tensor([i])]
        state_tensor = builder.statenode_to_tensor(state)
        node.rl = torch.cat(
            (node.rl, dqnmodel(Variable(state_tensor, volatile=True)).data), 0)
    for child in node.children:
        dfs_fill_table(child, table, dqnmodel, builder)
def dfs_fill_strategy(self, agent_sl, node, builder):
    if node.terminal:
        return
    if node.current_player == constants.players.chance:
        node.table = arguments.Tensor([])
        node.rl = arguments.Tensor([])
        for child in node.children:
            self.dfs_fill_strategy(agent_sl, child, builder)
        return
    # sl: fill one strategy column per private card
    for card in range(game_settings.card_count):
        state = GameState()
        for player in range(game_settings.player_count):
            state.private.append(arguments.Tensor([card]))
        state.node = node
        tensor = builder.statenode_to_tensor(state)
        strategy = agent_sl.model(Variable(tensor)).data[0][0:len(node.children)]
        if isinstance(agent_sl, DQNOptim):
            # sharpen Q-values into a near-greedy strategy: the best action gets
            # almost all the mass, the rest a small epsilon, then renormalize
            max_ix = strategy.lt(strategy.max())
            strategy[max_ix] = 0.0001
            strategy[1 - max_ix] = 1
            strategy.div_(strategy.sum())
        node.strategy[:, card] = strategy
    for child in node.children:
        self.dfs_fill_strategy(agent_sl, child, builder)
def dfs_fill_table(self, node, table, builder):
    if node.terminal:
        return
    if node.current_player == constants.players.chance:
        node.table = arguments.Tensor([])
        node.rl = arguments.Tensor([])
        for child in node.children:
            self.dfs_fill_table(child, table, builder)
        return
    # sl: copy this node's slice and normalize each column into a distribution
    all_table = table[node.node_id, :, 0:len(node.children)]
    node.table = torch.transpose(all_table.clone(), 0, 1)
    for i in range(node.table.size(1)):
        node.table[:, i].div_(node.table[:, i].sum())
    node.strategy = node.table.clone()
    for child in node.children:
        self.dfs_fill_table(child, table, builder)
def state2tensor(self, state):
    if state is None:
        return None
    # one-hot street block, 4 entries per street
    street_tensor = arguments.Tensor(constants.streets_count * 4).fill_(0)
    street_tensor[int(state.street) * 4:int(state.street + 1) * 4] = 1
    # one-hot position block, 4 entries per player
    position_tensor = arguments.Tensor(game_settings.player_count * 4).fill_(0)
    position_tensor[state.current_player * 4:(state.current_player + 1) * 4] = 1
    # indicator for the players still active in the hand
    active_tensor = arguments.Tensor(game_settings.player_count).fill_(0)
    active_tensor[state.active] = 1
    # bucketed bet sizes, one block of bet_bucket entries per player
    bet_tensor = arguments.Tensor(arguments.bet_bucket *
                                  game_settings.player_count).fill_(0)
    for i in range(game_settings.player_count):
        bet_tensor[i * arguments.bet_bucket +
                   int((state.bets[i] - 1) / arguments.bet_bucket_len)] = 1
    # bets relative to the pot: mark the first pot-fraction threshold each
    # player's bet falls under
    pot_size = state.bets.max().item()
    pot_tensor = arguments.Tensor(len(arguments.pot_times) *
                                  game_settings.player_count).fill_(0)
    for i in range(game_settings.player_count):
        for j in range(len(arguments.pot_times)):
            if state.bets[i] < arguments.pot_times[j] * pot_size:
                pot_tensor[i * len(arguments.pot_times) + j] = 1
                break
    # private and board cards as one-hot vectors
    private_tensor = self._cards_to_tensor(state.hole[state.current_player])
    board_tensor = self._cards_to_tensor(state.board)
    return torch.unsqueeze(
        torch.cat((street_tensor, position_tensor, active_tensor, bet_tensor,
                   pot_tensor, private_tensor, board_tensor), 0), 0)
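# A quick standalone check of the bet-bucket index used above; bets start at 1
# chip, so they are shifted by one before bucketing (bet_bucket_len = 400 is an
# illustrative value, not read from arguments):
def _bet_bucket_index(bet, bet_bucket_len=400):
    return int((bet - 1) / bet_bucket_len)

assert _bet_bucket_index(1) == 0      # smallest bet lands in bucket 0
assert _bet_bucket_index(400) == 0    # last bet of the first bucket
assert _bet_bucket_index(401) == 1    # first bet of the second bucket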
def statenode_to_tensor(self, state):
    if state is None:
        # keep the tensor width fixed so callers can always rely on size 20
        return torch.unsqueeze(arguments.Tensor(20).fill_(0), 0)
    # one-hot street ([1, 0] marks the first street; streets are 1-indexed here)
    street_tensor = arguments.Tensor(constants.streets_count).fill_(0)
    street_tensor[state.node.street - 1] = 1
    # bets normalized by the stack size
    bets_tensor = state.node.bets / arguments.stack
    # private and board cards as one-hot vectors
    assert len(state.private) == 2
    private_tensor = card_tools.hand_to_tensor(
        state.private[state.node.current_player])
    board_tensor = card_tools.hand_to_tensor(state.node.board)
    return torch.unsqueeze(
        torch.cat((street_tensor, bets_tensor, private_tensor, board_tensor), 0), 0)
def string_to_board(card_string):
    if card_string == '':
        return arguments.Tensor([])
    return arguments.Tensor([string_to_card(card_string)])
def step(self, state, action, is_rl=False):
    pot_size = state.bets.sum()
    current_player = state.current_player
    current_bet = state.bets[current_player]
    valid_actions = self.get_vaild_action(state)
    # action arrives as a LongTensor of shape [[a]]
    action_taken = int(action[0][0])
    # clamp invalid actions to the last valid one
    if action_taken >= len(valid_actions):
        action_taken = len(valid_actions) - 1
    action_tuple = valid_actions[action_taken]
    # copy the current state; deepcopy may be slow
    next_state = copy.deepcopy(state)
    state.next = next_state
    next_state.prev = state
    next_state.do_action(action_tuple)
    reward = arguments.Tensor([current_bet - next_state.bets[current_player]]) \
        if not self.distributed else arguments.Tensor([0])
    terminal = next_state.terminal
    # write the (possibly clamped) action back so the memory stores action_taken
    action[0][0] = action_taken
    if is_rl:
        self.store_memory(current_player, state, action, reward)
    if next_state.terminal:
        # credit each player's last transition with the terminal value
        terminal_value = next_state.get_terminal_value()
        for record in self.memory:
            if len(record) > 0:
                record_player = record[-1].state.current_player
                if self.distributed:
                    record[-1].reward.add_(terminal_value[record_player] -
                                           next_state.bets[record_player])
                else:
                    record[-1].reward.add_(terminal_value[record_player])
        # correct for the small and big blinds
        if len(self.memory[0]) > 0 and len(self.memory[1]) > 0 \
                and not self.distributed:
            self.memory[0][-1].reward.sub_(50)
            self.memory[1][-1].reward.sub_(100)
        next_state = None
    return next_state, terminal, action_taken
def get_possible_hand_indexes(self, board):
    out = arguments.Tensor(game_settings.card_count).fill_(0)
    if board.dim() == 0:
        out.fill_(1)
        return out
    whole_hand = arguments.Tensor(board.size(0) + 1)
    # the front of the hand holds the board cards
    whole_hand[0:-1].copy_(board)
    for card in range(game_settings.card_count):
        whole_hand[-1] = card
        if self.hand_is_possible(whole_hand):
            out[card] = 1
    return out
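# A standalone restatement of the mask semantics above: a hand is possible
# exactly when its card is not already on the board (deck size 6 is an assumed
# Leduc-style value):
def _possible_hand_mask(board_cards, card_count=6):
    return [0 if card in board_cards else 1 for card in range(card_count)]

assert _possible_hand_mask([]) == [1, 1, 1, 1, 1, 1]
assert _possible_hand_mask([3]) == [1, 1, 1, 0, 1, 1]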
def statenode_to_tensor(self, state):
    if state is None:
        return None
    node = state.node
    # one-hot street ([1, 0] marks the first street)
    street_tensor = arguments.Tensor(constants.streets_count).fill_(0)
    street_tensor[int(node.street)] = 1
    # the acting player's position, repeated four times
    position_tensor = arguments.Tensor(4).fill_(node.current_player)
    # bucketed bets for the acting player and the opponent
    bet_player_tensor = arguments.Tensor(arguments.bet_bucket).fill_(0)
    bet_player_tensor[int((node.bets[node.current_player] - 1) /
                          arguments.bet_bucket_len)] = 1
    bet_oppo_tensor = arguments.Tensor(arguments.bet_bucket).fill_(0)
    bet_oppo_tensor[int((node.bets[1 - node.current_player] - 1) /
                        arguments.bet_bucket_len)] = 1
    # private and board cards as one-hot vectors
    assert len(state.private) == 2
    private_tensor = card_tools.hand_to_tensor(
        arguments.Tensor(state.private[node.current_player].tolist()))
    board_tensor = card_tools.hand_to_tensor(node.board)
    # layout: street | position | player bets | opponent bets | private | board
    return torch.unsqueeze(
        torch.cat((street_tensor, position_tensor, bet_player_tensor,
                   bet_oppo_tensor, private_tensor, board_tensor), 0), 0)
def get_possible_bets(self, node):
    current_player = node.current_player
    assert (current_player >= 0 and
            current_player < game_settings.player_count), \
        'Wrong player for bet size computation'
    opponent_bet = node.bets.max()
    assert node.bets[current_player] <= opponent_bet
    # compute the minimum and maximum possible raise sizes
    max_raise_size = arguments.stack - opponent_bet
    min_raise_size = opponent_bet - node.bets[current_player]
    min_raise_size = max(min_raise_size, arguments.ante)
    min_raise_size = min(max_raise_size, min_raise_size)
    if min_raise_size == 0:
        return arguments.Tensor()
    elif min_raise_size == max_raise_size:
        # only the all-in raise is possible
        out = arguments.Tensor(1, game_settings.player_count)
        out[0] = node.bets.clone()
        out[0][current_player] = opponent_bet + min_raise_size
        return out
    else:
        # iterate through all bets and check if they are possible
        max_possible_bets_count = self.pot_fractions.size(0) + 1  # we can always go all-in
        out = arguments.Tensor(max_possible_bets_count, game_settings.player_count)
        for i in range(max_possible_bets_count):
            out[i] = node.bets.clone()
        # pot size after the opponent's bet is called
        pot = opponent_bet * 2
        used_bets_count = 0
        # try all pot-fraction bets and see if we can use them
        for i in range(self.pot_fractions.size(0)):
            raise_size = pot * self.pot_fractions[i]
            if min_raise_size <= raise_size < max_raise_size:
                used_bets_count = used_bets_count + 1
                out[used_bets_count - 1, current_player] = opponent_bet + raise_size
        # add the all-in
        used_bets_count = used_bets_count + 1
        assert used_bets_count <= max_possible_bets_count
        out[used_bets_count - 1, current_player] = opponent_bet + max_raise_size
        return out[0:used_bets_count, :]
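# A pure-Python sketch of the pot-fraction filter above (stack, ante, and the
# fraction list are illustrative assumptions; the real values come from
# arguments and self.pot_fractions). Returns the possible total bets:
def _possible_total_bets(player_bet, opponent_bet, stack=20000, ante=100,
                         fractions=(0.5, 1.0, 2.0)):
    max_raise = stack - opponent_bet
    min_raise = min(max(opponent_bet - player_bet, ante), max_raise)
    pot = opponent_bet * 2  # pot size once the opponent's bet is called
    sizes = [pot * f for f in fractions if min_raise <= pot * f < max_raise]
    sizes.append(max_raise)  # the all-in raise is always available
    return [opponent_bet + s for s in sizes]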
def batch_eval(self, board, impossible_hand_value=-1):
    hand_values = arguments.Tensor(game_settings.card_count).fill_(-1)
    if board.dim() == 0:
        for hand in range(game_settings.card_count):
            # cards are 0-indexed, so the preflop rank is floor(hand / suit_count) + 1
            hand_values[hand] = math.floor(hand / game_settings.suit_count) + 1
    else:
        board_size = board.size(0)
        assert board_size == 1 or board_size == 2, 'Incorrect board size for Leduc'
        whole_hand = arguments.Tensor(board_size + 1)
        whole_hand[0:-1].copy_(board)
        for card in range(game_settings.card_count):
            whole_hand[-1] = card
            hand_values[card] = self.evaluate(whole_hand, impossible_hand_value)
    return hand_values
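# A quick standalone check of the preflop ranking above (assumes the usual
# Leduc layout: 6 cards, 2 suits per rank, cards 0..5 rank-major):
import math

def _leduc_preflop_rank(card, suit_count=2):
    return math.floor(card / suit_count) + 1

# cards 0 and 1 share rank 1, 2 and 3 share rank 2, 4 and 5 share rank 3
assert [_leduc_preflop_rank(c) for c in range(6)] == [1, 1, 2, 2, 3, 3]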
def parsed_state_to_nodestate(self, processed_state):
    node = Node()
    node.street = processed_state['current_street']
    node.board = card_to_string.string_to_board(processed_state['board'])
    node.current_player = processed_state['acting_player']
    node.bets = arguments.Tensor([processed_state['bet1'], processed_state['bet2']])
    state = GameState()
    state.node = node
    # TODO mjb: private cards are hard-coded; only the acting player's hand is known
    state.private = [-1 for _ in range(game_settings.player_count)]
    state.private[node.current_player] = arguments.Tensor([processed_state['hand_id']])
    return state
def finish_episode(self, env_memory):
    self.model.train()
    self.steps_done += 1
    policy_loss = []
    for i_agent in range(len(env_memory)):
        if len(env_memory[i_agent]) == 0:
            continue
        env_reward = reinf_tran(*zip(*env_memory[i_agent])).reward
        # compute discounted returns, walking the rewards backwards so that R
        # accumulates the discounted future reward
        rewards = []
        R = 0
        for r in reversed(env_reward):
            R = r + arguments.gamma * R
            rewards.insert(0, R)
        rewards = arguments.Tensor(rewards)
        # normalize returns by the stack size
        rewards = rewards / arguments.stack
        for log_prob, reward in zip(self.model.saved_log_probs[i_agent], rewards):
            policy_loss.append(-log_prob * reward)
    self.optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    # clamp gradients to [-1, 1]; clamping at (0, 1) would zero out all
    # negative gradients
    for param in self.model.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
    for i_policy, _ in enumerate(self.model.saved_log_probs):
        self.model.saved_log_probs[i_policy] = []
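# A self-contained check of the discounted-return recursion above (gamma and
# the reward sequence are made-up illustrative values):
def _discounted_returns(rewards, gamma=0.99):
    returns, R = [], 0.0
    for r in reversed(rewards):  # walk backwards so R accumulates the future
        R = r + gamma * R
        returns.insert(0, R)
    return returns

assert _discounted_returns([0.0, 0.0, 1.0], gamma=0.5) == [0.25, 0.5, 1.0]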
def __init__(self):
    self.current_player = -2
    self.node_type = ""
    self.type = ""
    self.street = -1
    self.board = ""
    self.board_string = ""
    self.bets = arguments.Tensor(2)
    self.pot = 0
    self.parent = None
    self.children = []
    self.terminal = False
    self.actions = []
    self.bet_sizing = []
    self.node_id = 0
    self.strategy = arguments.Tensor([])
def build_tree(self, params):
    root = Node()
    # copy the required fields from params['root_node'] so the input is not modified
    root.street = params['root_node']['street']
    root.bets = params['root_node']['bets'].clone()
    root.current_player = params['root_node']['current_player']
    root.board = params['root_node']['board'].clone()
    root.board_string = card_to_string.cards_to_string(root.board)
    params['bet_sizing'] = params['bet_sizing'] if 'bet_sizing' in params \
        else BetSizing(arguments.Tensor(arguments.bet_sizing))
    assert params['bet_sizing']
    self.bet_sizing = params['bet_sizing']
    self.limit_to_street = params['limit_to_street']
    self._build_tree_dfs(root)
    return root
def _set_fold_matrix(self, board):
    self.fold_matrix = arguments.Tensor(game_settings.card_count,
                                        game_settings.card_count)
    self.fold_matrix.fill_(1)
    # set cards that block each other to zero - exactly the diagonal elements
    # in Leduc variants
    self.fold_matrix.sub_(
        torch.eye(game_settings.card_count).type_as(self.fold_matrix))
    self._handle_blocking_cards(self.fold_matrix, board)
def hand_to_tensor(self, hand):
    hand_tensor = arguments.Tensor(game_settings.card_count).fill_(0)
    if hand.dim() == 0:
        return hand_tensor
    for card in hand:
        hand_tensor[int(card)] = 1
    return hand_tensor
def get_random_range(self, board, seed=None):
    # sample a random range over the hands that are possible on the given board
    gen = torch.Generator()
    if seed is None:
        gen.seed()  # seed from a nondeterministic source
    else:
        gen.manual_seed(seed)
    out = torch.rand(game_settings.card_count,
                     generator=gen).type_as(arguments.Tensor())
    # zero out impossible hands, then renormalize to a distribution
    out.mul_(self.get_possible_hand_indexes(board))
    out.div_(out.sum())
    return out
def get_second_round_boards(self):
    boards_count = self.get_boards_count()
    if game_settings.board_card_count == 1:
        out = arguments.Tensor(boards_count, 1)
        for card in range(game_settings.card_count):
            out[card, 0] = card
        return out
    elif game_settings.board_card_count == 2:
        out = arguments.Tensor(boards_count, 2)
        board_idx = 0
        for card_1 in range(game_settings.card_count):
            for card_2 in range(card_1 + 1, game_settings.card_count):
                # fill first, then advance, so board indices stay 0-based
                out[board_idx, 0] = card_1
                out[board_idx, 1] = card_2
                board_idx = board_idx + 1
        assert board_idx == boards_count, 'wrong boards count!'
        return out
    else:
        assert False, 'unsupported board size'
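# A standalone sanity check of the enumeration above: a 1-card board has
# card_count possibilities and a 2-card board has C(card_count, 2)
# (card_count = 6 is an assumed Leduc-style value):
def _second_round_board_count(card_count=6, board_card_count=2):
    if board_card_count == 1:
        return card_count
    return card_count * (card_count - 1) // 2

assert _second_round_board_count(6, 1) == 6
assert _second_round_board_count(6, 2) == 15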
def main():
    import time
    time_start = time.time()
    total_reward = 0.0
    for i_episode in range(arguments.epoch_count):
        # choose a policy: 0 - supervised (sl), 1 - reinforcement (rl)
        flag = 0 if random.random() > arguments.eta else 1
        # initialize the environment and state
        env.reset()
        state = env.state
        for t in count():
            state_tensor = builder.statenode_to_tensor(state)
            assert state_tensor.size(1) == 20
            # select and perform an action
            if flag == 0:  # sl
                action = table_sl.select_action(state)
            else:  # rl
                action = dqn_optim.select_action(state_tensor)
            next_state, reward, done = env.step(state, int(action[0][0]))
            # transform to tensors
            next_state_tensor = builder.statenode_to_tensor(next_state)
            reward_tensor = arguments.Tensor([reward])
            action_tensor = action
            # store the transition in the reinforcement learning memory M_rl
            dqn_optim.memory.push(state_tensor, action_tensor,
                                  next_state_tensor, reward_tensor)
            if flag == 1:
                # if the rl policy was used, store the tuple (s, a) in the
                # supervised learning memory M_sl
                table_sl.store(state, action)
            # perform one step of the optimization (on the target network)
            dqn_optim.optimize_model()
            # move to the next state
            state = next_state
            # accumulate the reward
            total_reward = total_reward + reward
            if done:
                dqn_optim.episode_durations.append(t + 1)
                break
    print('Complete')
    print(time.time() - time_start)
    print(total_reward / arguments.epoch_count)
def test(self, table_sl):
    builder = PokerTreeBuilder()
    params = {}
    params['root_node'] = {}
    params['root_node']['board'] = card_to_string.string_to_board('')
    params['root_node']['street'] = 0
    params['root_node']['current_player'] = constants.players.P1
    params['root_node']['bets'] = arguments.Tensor([100, 100])
    params['limit_to_street'] = False
    tree = builder.build_tree(params)
    # construct the starting ranges
    filling = StrategyFilling()
    range1 = card_tools.get_uniform_range(params['root_node']['board'])
    range2 = card_tools.get_uniform_range(params['root_node']['board'])
    filling.fill_uniform(tree)
    starting_ranges = arguments.Tensor(game_settings.player_count,
                                       game_settings.card_count)
    starting_ranges[0].copy_(range1)
    starting_ranges[1].copy_(range2)
    table_sl.model.eval()
    self.dfs_fill_strategy(table_sl, tree, builder)
    tree_values = TreeValues()
    tree_values.compute_values(tree, starting_ranges)
    print('Exploitability: ' + str(tree.exploitability.item()) + ' [chips]')
    return tree.exploitability.item()
def _fill_uniformly(self, node):
    assert (node.current_player == constants.players.P1 or
            node.current_player == constants.players.P2)
    if node.terminal:
        return
    node.strategy = arguments.Tensor(
        len(node.children),
        game_settings.card_count).fill_(1.0 / len(node.children))
def __init__(self):
    params = {}
    params['root_node'] = {}
    params['root_node']['board'] = card_to_string.string_to_board('')
    params['root_node']['street'] = 0
    params['root_node']['current_player'] = constants.players.P1
    params['root_node']['bets'] = arguments.Tensor([100, 100])
    params['limit_to_street'] = False
    builder = PokerTreeBuilder()
    self.root_node = builder.build_tree(params)
    filling.fill_uniform(self.root_node)
    self.state = GameState()
    self._cached_terminal_equities = {}
def generate_cards(count):
    # mark all used cards
    used_cards = torch.ByteTensor(game_settings.card_count).zero_()
    out = arguments.Tensor(count)
    # counter for generated cards
    generated_cards_count = 0
    while generated_cards_count < count:
        card = random.randint(0, game_settings.card_count - 1)
        if used_cards[card] == 0:
            out[generated_cards_count] = card
            generated_cards_count = generated_cards_count + 1
            used_cards[card] = 1
    return out
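# generate_cards draws without replacement by rejection sampling, which is fine
# for a small deck. An equivalent standalone sketch using the standard library
# (deck size 6 is an assumed Leduc-style value):
import random

def _generate_cards_alt(count, card_count=6):
    # random.sample draws `count` distinct cards without replacement
    return random.sample(range(card_count), count)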
def _init_board_index_table(self):
    if game_settings.board_card_count == 1:
        self._board_index_table = torch.arange(
            0, game_settings.card_count).float()
    elif game_settings.board_card_count == 2:
        self._board_index_table = arguments.Tensor(
            game_settings.card_count, game_settings.card_count).fill_(-1)
        board_idx = 0
        for card_1 in range(game_settings.card_count):
            for card_2 in range(card_1 + 1, game_settings.card_count):
                # fill first, then advance, matching the 0-based board
                # indices produced by get_second_round_boards
                self._board_index_table[card_1][card_2] = board_idx
                self._board_index_table[card_2][card_1] = board_idx
                board_idx = board_idx + 1
    else:
        assert False, 'unsupported board size'
def compute_action(self, state):
    # convert the state to a tensor for the rl net
    builder = PokerTreeBuilder()
    state_tensor = builder.statenode_to_tensor(state)
    # note: select_action returns a LongTensor of shape [[a]]
    action_id = self.net_sl[state.node.current_player].select_action(
        state_tensor).item()
    # action dict: {'action': ..., 'raise_amount': ...}
    action = {}
    if action_id == 0:
        # fold
        action['action'] = constants.acpc_actions.fold
    elif action_id == 1 or action_id >= game_settings.actions_count:
        # call
        action['action'] = constants.acpc_actions.ccall
    elif action_id > 1:
        # raise: use the possible bets to determine the raise size
        bet_sizing = BetSizing(arguments.Tensor(arguments.bet_sizing))
        possible_bets = bet_sizing.get_possible_bets(state.node)
        if possible_bets.dim() != 0:
            possible_bet = possible_bets[:, state.node.current_player]
        else:
            # no possible bet in this state, so call instead
            action['action'] = constants.acpc_actions.ccall
            return action
        raise_action_id = action_id - 2  # offset past fold and call
        action['action'] = constants.acpc_actions.rraise
        if len(possible_bet) <= raise_action_id:
            # out-of-range raise ids fall back to the largest possible bet
            action['raise_amount'] = possible_bet[len(possible_bet) - 1].item()
        else:
            action['raise_amount'] = possible_bet[raise_action_id].item()
    else:
        assert False  # invalid action
    return action
def _fill_chance(self, node):
    assert not node.terminal
    # fill the strategy with a uniform probability, but it has to be zero for
    # hands that are not possible on the corresponding board
    node.strategy = arguments.Tensor(len(node.children),
                                     game_settings.card_count).fill_(0)
    for i in range(len(node.children)):
        child_node = node.children[i]
        # set the probability of impossible hands to 0
        mask = card_tools.get_possible_hand_indexes(child_node.board).byte()
        node.strategy[i].fill_(0)
        # subtract 2 from the count because each player holds one card
        node.strategy[i][mask] = 1.0 / (game_settings.card_count - 2)
def _get_terminal_value(self, state):
    node = state.node
    assert node.terminal
    # start from a sign vector: +1 for the current player, -1 for the opponent;
    # it is then scaled by the amount won or lost
    value = arguments.Tensor(2).fill_(-1)
    value[node.current_player] = 1
    if node.node_type == constants.node_types.terminal_fold:
        # terminal fold
        value.mul_(node.bets[1 - node.current_player])
    else:
        # showdown (ties are treated as a loss for the current player here)
        player_hand = self.private[
            node.current_player].tolist() + node.board.tolist()
        player_strength = evaluator.evaluate(player_hand, -1)
        oppo_hand = self.private[
            1 - node.current_player].tolist() + node.board.tolist()
        oppo_strength = evaluator.evaluate(oppo_hand, -1)
        if player_strength > oppo_strength:
            value.mul_(node.bets[1 - node.current_player])
        else:
            value.mul_(-node.bets[1 - node.current_player])
    return value