def test_reward_in_log_needs_to_accumulate():
    # Assumes Player, StandardDeck, Game, and Deck come from the game
    # module under test. The logged reward for each player must be
    # cumulative, i.e. non-decreasing from turn to turn.
    p1 = Player("bob")
    p2 = Player("sharon")
    d = StandardDeck()
    # d = Deck(egg=15, salmon=15, squid=15, tempura=15,
    #          sashimi=15, dumpling=15, pudding=0,
    #          wasabi=15, maki1=10, maki2=10, maki3=10)
    g = Game(deck=d, agents=[p1, p2], cards_per_player=10, n_rounds=2)
    g.simulate_game()

    df = g.gamelog.sort_values(["player", "turn"])
    for player in ["bob", "sharon"]:
        print(df[df['player'] == player])

    p1_rewards = df[df['player'] == 'bob']['reward']
    p2_rewards = df[df['player'] == 'sharon']['reward']
    print(g.scores)

    # Successive differences of a cumulative series must be non-negative;
    # shift().fillna(0) also compares the first reward against 0.
    assert all(_ >= 0 for _ in (p1_rewards - p1_rewards.shift().fillna(0)))
    assert all(_ >= 0 for _ in (p2_rewards - p2_rewards.shift().fillna(0)))
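The same invariant can be stated more directly with pandas built-ins. The line below is a sketch equivalent to the shift-and-subtract asserts above: is_monotonic_increasing rejects any decrease in the series, and the extra iloc[0] check covers the comparison of the first reward against 0.

# Equivalent check using pandas built-ins (same semantics as above):
assert p1_rewards.is_monotonic_increasing and p1_rewards.iloc[0] >= 0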
# Imports assumed by this snippet; Policy, StandardDeck, Pg_player,
# Simple_player, Game, finish_game, and adjust_learning_rate come from
# the project's own modules.
import torch
import torch.optim as optim

# Set up the policy network. Seed before constructing it so the weight
# initialization is reproducible; the positional arguments are presumably
# (cell type, input size, hidden size, n layers, n actions).
torch.manual_seed(123)
policy = Policy('LSTM', 22, 20, 1, 11)

# Hyperparameters
gamma = 0.99  # discount factor for returns
lr = 1e-2
optimizer = optim.Adam(policy.parameters(), lr=lr)
log_interval = 10  # not used in this excerpt

# Set up the players: a policy-gradient agent versus a baseline that
# weights every distinct card equally.
deck = StandardDeck()
N_cards = len(set(str(_) for _ in deck))
p1 = Pg_player(policy=policy, name="PG_player01")
p2 = Simple_player(weights=[1 / N_cards] * N_cards, name="SIMPLE_player01")

# Play 100 games, tracking an exponentially weighted moving average
# (EWMA) of the win rate against the baseline.
ewma = 0.5
alpha = 0.95
for n in range(100):
    game = Game([p1, p2], verbose=False)
    game.simulate_game()
    win = game.did_player_win(p1.name)
    ewma = alpha * ewma + (1 - alpha) * int(win)
    print('At %3i ewma win ratio %5.3f' % (n, ewma))
    # Policy-gradient update for the finished game.
    finish_game(policy, gamma=gamma, optimizer=optimizer)
    p1.prev_reward = None
    # Decay the learning rate on a fixed schedule (every 30 games).
    optimizer = adjust_learning_rate(optimizer, n, lr, 30)
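finish_game and adjust_learning_rate are called above but not defined in this excerpt. For context, here is a minimal REINFORCE-style sketch of what they might look like; the policy.saved_log_probs and policy.rewards buffers, the return normalization, and the halve-every-decay_every schedule are illustrative assumptions, not the project's actual API.

import torch

def finish_game(policy, gamma, optimizer):
    # Discounted returns, computed newest-to-oldest from the per-turn
    # rewards collected during the game (assumed buffer on the policy).
    R = 0.0
    returns = []
    for r in reversed(policy.rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    # Normalize returns to reduce the variance of the gradient estimate.
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    # REINFORCE loss: -log pi(a|s) * return, summed over the game.
    loss = torch.stack(
        [-log_p * ret for log_p, ret in zip(policy.saved_log_probs, returns)]
    ).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Clear the per-game buffers before the next episode.
    del policy.rewards[:]
    del policy.saved_log_probs[:]

def adjust_learning_rate(optimizer, epoch, base_lr, decay_every):
    # Step decay: halve the learning rate every decay_every games.
    new_lr = base_lr * (0.5 ** (epoch // decay_every))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return optimizer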