# ----- Example #1 -----
# Evaluation loop over 20 checkpoint epochs: for each epoch, load one
# RL (best-response) and one SL (average-policy) network per NFSP agent
# from consecutive checkpoint files, then run head-to-head experiments.
for epoch in range(20):

    # rl_dir2 = checkpoint_dir+'/rl_checkpoints/checkpoint_' + str(epoch+2) + '000.pt'
    # sl_dir1 = checkpoint_dir+'/sl_checkpoints/checkpoint_' + str(epoch+1) + '000.pt'
    # sl_dir2 = checkpoint_dir+'/sl_checkpoints/checkpoint_' + str(epoch+2) + '000.pt'

    # One (policy_net, sl_net) pair per NFSP agent, keyed by agent index i.
    policy_net = {}
    sl_net = {}
    rl_dir = {}
    sl_dir = {}
    for i in range(num_NFSP):
        # Agent i plays checkpoint version epoch+1+i; filenames end in
        # '000.pt' (presumably checkpoints saved every 1000 steps — confirm).
        rl_dir[i] = checkpoint_dir+'/rl_checkpoints/checkpoint_' + str(epoch+1+i) + '000.pt'
        sl_dir[i] = checkpoint_dir+'/sl_checkpoints/checkpoint_' + str(epoch+1+i) + '000.pt'
        policy_net[i] = DQN_limit(num_player=num_player,big=False, num_action = 3, hidden_units = hid).to(device)
        sl_net[i] = MLP_limit(num_player=num_player, big=False, num_action = 3, hidden_units = hid).to(device)

        # Restore RL-net weights (loaded onto CPU, then copied into the
        # device-resident parameters by load_state_dict) and switch to
        # inference mode.
        checkpoint = torch.load(rl_dir[i], map_location='cpu')
        policy_net[i].load_state_dict(checkpoint['model'])
        policy_net[i].eval()

        # Same restore for the supervised (average-policy) net.
        checkpoint = torch.load(sl_dir[i], map_location='cpu')
        sl_net[i].load_state_dict(checkpoint['model'])
        sl_net[i].eval()

    # NOTE(review): 'expriment' is a typo for 'experiment' (harmless —
    # local loop variable only).
    results = []
    for expriment in range(3):

        game_board = {}
        sum_board = {}
# ----- Example #2 -----
# Evaluation loop over num_versions checkpoint versions: for each version,
# load the RL (best-response) and SL (average-policy) networks for
# checkpoint epoch+1, then run three experiments with them.
for epoch in range(num_versions):
    # Checkpoint paths; filenames end in '000.pt' (presumably saved every
    # 1000 steps — confirm against the training script).
    rl_dir = checkpoint_dir + '/rl_checkpoints/checkpoint_' + str(epoch +
                                                                  1) + '000.pt'
    sl_dir = checkpoint_dir + '/sl_checkpoints/checkpoint_' + str(epoch +
                                                                  1) + '000.pt'
    # policy_net = DQN(num_player=num_player,big=False).to(device)
    policy_net = DQN_limit(num_player=num_player,
                           big=big,
                           num_action=3,
                           hidden_units=hid).to(device)
    # checkpoint = torch.load('../holdem_result/rl_checkpoints/checkpoint1999999_8235201.000.pt')
    # Weights are loaded onto CPU first; load_state_dict copies them into
    # the device-resident parameters. eval() freezes the net for inference.
    checkpoint = torch.load(rl_dir, map_location='cpu')
    policy_net.load_state_dict(checkpoint['model'])
    policy_net.eval()
    sl_net = MLP_limit(num_player=num_player,
                       big=big,
                       num_action=3,
                       hidden_units=hid).to(device)
    # sl_net = MLP(num_player=num_player, big=False).to(device)
    checkpoint = torch.load(sl_dir, map_location='cpu')
    # checkpoint = torch.load('../holdem_result/sl_checkpoints/checkpoint1999999_8235201.000.pt')
    sl_net.load_state_dict(checkpoint['model'])
    sl_net.eval()

    # NOTE(review): 'expriment' is a typo for 'experiment' (local loop
    # variable only).
    results_1 = []
    results_2 = []
    for expriment in range(3):

        game_board = {}
        sum_board = {}

        nfsp_players = {}
# ----- Example #3 -----
                       big=big,
                       res_net=use_res_net,
                       num_layer=num_layer,
                       num_action=3,
                       hidden_units=num_hid).to(device)
# Target network: identical architecture to the policy network, initialised
# from the policy network's current weights and kept in eval mode (it is
# only updated by explicit state-dict syncs, not by the optimizer).
_net_kwargs = dict(num_player=num_player,
                   big=big,
                   res_net=use_res_net,
                   num_layer=num_layer,
                   num_action=3,
                   hidden_units=num_hid)
target_net = DQN_limit(**_net_kwargs).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Supervised (average-policy) network shares the same constructor arguments.
sl_net = MLP_limit(**_net_kwargs).to(device)

# Optimizers: plain SGD for the RL (best-response) net, Adam for the SL
# (average-policy) net, both with the same weight decay.
rl_optimizer = optim.SGD(policy_net.parameters(),
                         lr=lr_RL,
                         weight_decay=weight_decay)
sl_optimizer = optim.Adam(sl_net.parameters(),
                          lr=lr_SL,
                          weight_decay=weight_decay)
def optimize_model():
    """Run one training step (body continues beyond this excerpt).

    No-op until both replay memories — M_rl (RL transitions) and M_sl
    (SL behaviour data) — hold at least one full batch of samples.
    """
    # Guard: skip the update entirely while either buffer is under-filled.
    if len(M_rl) < BATCH_SIZE or len(M_sl) < BATCH_SIZE:
        return