Example #1
def test_reward_in_log_needs_to_accumulate():
    # Assumes Player, StandardDeck and Game are imported from the game
    # module under test.
    p1 = Player("bob")
    p2 = Player("sharon")
    d = StandardDeck()
    # d = Deck(egg=15, salmon=15, squid=15, tempura=15,
    #          sashimi=15, dumpling=15, pudding=0,
    #          wasabi=15, maki1=10, maki2=10, maki3=10)
    g = Game(deck=d, agents=[p1, p2], cards_per_player=10, n_rounds=2)
    g.simulate_game()

    # Sort the game log per player and per turn so that consecutive rows
    # describe consecutive turns of the same player.
    df = g.gamelog.sort_values(["player", "turn"])
    for player in ["bob", "sharon"]:  # only the two players in this game
        print(df[df['player'] == player])
    p1_rewards = df[df['player'] == 'bob']['reward']
    p2_rewards = df[df['player'] == 'sharon']['reward']
    print(g.scores)

    # The logged reward is cumulative, so the turn-to-turn difference must
    # never be negative.
    assert all(delta >= 0 for delta in (p1_rewards - p1_rewards.shift().fillna(0)))
    assert all(delta >= 0 for delta in (p2_rewards - p2_rewards.shift().fillna(0)))
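The key assertion is that each player's logged reward is cumulative and therefore never drops between turns. A minimal, self-contained illustration of the same check on a toy pandas Series, independent of the game classes above:

import pandas as pd

rewards = pd.Series([0, 2, 2, 5, 9])          # cumulative reward per turn
deltas = rewards - rewards.shift().fillna(0)  # per-turn increments
assert (deltas >= 0).all()                    # the reward only accumulates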
Example #2
# Assumes: import torch, import torch.optim as optim, plus the project's
# Policy, Pg_player, Simple_player, Game, StandardDeck, finish_game and
# adjust_learning_rate.

# Set up policy (seed before constructing the network so its initial
# weights are reproducible)
torch.manual_seed(123)
policy = Policy('LSTM', 22, 20, 1, 11)

# Parameters
gamma = 0.99  # discount factor for the policy-gradient returns

# Set up optimizer
lr = 1e-2
optimizer = optim.Adam(policy.parameters(), lr=lr)
log_interval = 10

# Play games
deck = StandardDeck()
N_cards = len({str(card) for card in deck})  # number of distinct card types
p1 = Pg_player(policy=policy, name="PG_player01")
p2 = Simple_player(weights=[1 / N_cards] * N_cards, name="SIMPLE_player01")

# Exponentially weighted moving average of the win indicator, used to track
# how often the policy-gradient player beats the simple player.
ewma = 0.5
alpha = 0.95
for n in range(100):
    game = Game([p1, p2], verbose=False)
    game.simulate_game()
    win = game.did_player_win(p1.name)
    ewma = alpha * ewma + (1 - alpha) * int(win)
    print('At %3i ewma win ratio %5.3f' % (n, ewma))

    # Apply the policy-gradient update for the finished game, reset the
    # player's reward bookkeeping, and adjust the learning rate on a schedule.
    finish_game(policy, gamma=gamma, optimizer=optimizer)
    p1.prev_reward = None
    optimizer = adjust_learning_rate(optimizer, n, lr, 30)
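The loop relies on two helpers, finish_game and adjust_learning_rate, that are not shown in the example. Below is a minimal sketch of what they might look like, assuming a standard REINFORCE-style update in which the policy stores per-turn rewards in policy.rewards and action log-probabilities in policy.saved_log_probs (as in PyTorch's REINFORCE example), and a simple step-decay learning-rate schedule; the real implementations may differ.

import torch

def finish_game(policy, gamma, optimizer):
    # Discount the per-turn rewards collected during the game into returns
    # (assumes policy.rewards was filled while playing).
    R = 0.0
    returns = []
    for r in reversed(policy.rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    # One gradient step on the log-probability-weighted loss.
    loss = torch.stack([-lp * ret for lp, ret in zip(policy.saved_log_probs, returns)]).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Clear the buffers for the next game.
    del policy.rewards[:]
    del policy.saved_log_probs[:]

def adjust_learning_rate(optimizer, n, base_lr, decay_every):
    # Hypothetical step decay: halve the learning rate every `decay_every` games.
    new_lr = base_lr * (0.5 ** (n // decay_every))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return optimizer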