Example #1
0
# Evaluate a sequence of saved RL/SL checkpoint pairs for a limit-poker agent.
# NOTE(review): `env`, `get_card_dict`, `checkpoint_dir`, `num_versions`,
# `num_player`, `big`, `hid`, and `device` are defined earlier in the file
# (not visible in this chunk) — confirm against the full script.
env.add_player(3, stack=20000)  # add a player to seat 3 with a 20000-chip stack
card_dictionary = get_card_dict()

for epoch in range(num_versions):
    # Checkpoint files follow the pattern checkpoint_<epoch+1>000.pt,
    # i.e. one snapshot per 1000 training steps.
    rl_dir = checkpoint_dir + '/rl_checkpoints/checkpoint_' + str(epoch +
                                                                  1) + '000.pt'
    sl_dir = checkpoint_dir + '/sl_checkpoints/checkpoint_' + str(epoch +
                                                                  1) + '000.pt'
    # policy_net = DQN(num_player=num_player,big=False).to(device)
    # Reinforcement-learning (Q-value) network; 3 actions (fold/call/raise
    # presumably — TODO confirm action encoding against the env).
    policy_net = DQN_limit(num_player=num_player,
                           big=big,
                           num_action=3,
                           hidden_units=hid).to(device)
    # checkpoint = torch.load('../holdem_result/rl_checkpoints/checkpoint1999999_8235201.000.pt')
    # map_location='cpu' lets GPU-trained weights load on a CPU-only machine.
    checkpoint = torch.load(rl_dir, map_location='cpu')
    policy_net.load_state_dict(checkpoint['model'])
    policy_net.eval()  # inference mode: freeze dropout/batch-norm behavior
    # Supervised-learning (average-policy) network with matching dimensions.
    sl_net = MLP_limit(num_player=num_player,
                       big=big,
                       num_action=3,
                       hidden_units=hid).to(device)
    # sl_net = MLP(num_player=num_player, big=False).to(device)
    checkpoint = torch.load(sl_dir, map_location='cpu')
    # checkpoint = torch.load('../holdem_result/sl_checkpoints/checkpoint1999999_8235201.000.pt')
    sl_net.load_state_dict(checkpoint['model'])
    sl_net.eval()

    results_1 = []
    results_2 = []
    # NOTE(review): "expriment" is a typo for "experiment" (identifier kept
    # unchanged); the loop body is truncated in this chunk.
    for expriment in range(3):
Example #2
0
# --- networks -------------------------------------------------------------
# All three networks share the same constructor arguments; build them once.
net_kwargs = dict(num_player=num_player,
                  big=big,
                  res_net=use_res_net,
                  num_layer=num_layer,
                  num_action=3,
                  hidden_units=num_hid)

# Q-network updated by RL, plus a frozen target copy for stable bootstrapping.
policy_net = DQN_limit(**net_kwargs).to(device)
target_net = DQN_limit(**net_kwargs).to(device)
# Initialize the target with the exact weights of the policy network.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Average-policy network trained by supervised learning.
sl_net = MLP_limit(**net_kwargs).to(device)

# --- optimizers -----------------------------------------------------------
rl_optimizer = optim.SGD(policy_net.parameters(),
                         lr=lr_RL,
                         weight_decay=weight_decay)
sl_optimizer = optim.Adam(sl_net.parameters(),
                          lr=lr_SL,
                          weight_decay=weight_decay)