def main():
    N = 4
    ITERS = 2000000  # iteration budget for the second MCCFR run below
    g = Goofspiel(N, scoring=Goofspiel.Scoring.ZEROSUM)
    mc = OutcomeMCCFR(g, seed=56)
    its = 100.0
    while its < 1000000:
        fname = "goof-{}-{}.strat".format(N, int(its))
        # Train (and cache) up to int(its) total iterations.
        mc.persist(fname, iterations=int(its) - mc.iterations)
        print("Exploitability after {:7d} turns (mc, g): {}, {}".format(
            int(its), exploitability(g, 0, mc), exploitability(g, 1, mc)))
        its *= 2 ** 0.5
    assert 0  # intentional early abort: the value-learning experiment below is disabled
    vs = GoofSpielCardsValueStore(g)
    vl = SparseStochasticValueLearning(g, vs, seed=41)
    vals = np.concatenate([
        vl.compute([mc, mc], 1000, alpha=0.01, store_step=1),
        vl.compute([mc, mc], 1000, alpha=0.001, store_step=1),
        vl.compute([mc, mc], 1000, alpha=0.0001, store_step=1),
    ], axis=0)
    plt.plot(vals)
    plt.show()
    print("Values:", vs.values)
    g2 = Goofspiel(N, scoring=Goofspiel.Scoring.ZEROSUM, rewards=vs.values)
    mc2 = OutcomeMCCFR(g2, seed=57)
    mc2.compute(iterations=ITERS)
    print("Exp(mc2, g2)", exploitability(g2, 0, mc2), exploitability(g2, 1, mc2))
    print("Exp(mc2, g)", exploitability(g, 0, mc2), exploitability(g, 1, mc2))
def test_goofspiel():
    g = Goofspiel(7, Goofspiel.Scoring.ZEROSUM)
    s = g.start()
    assert s.is_chance()
    assert s.actions == tuple(range(1, 8))
    assert s.chance == (pytest.approx(1 / 7),) * 7
    for a in [4, 2, 1, 5, 4, 6, 6, 3, 3, 2, 5, 4, 3, 7]:
        s = s.play(a)
    assert s.player == 1
    assert s.actions == (2, 5, 7)
    assert s.observations[2] == (4, 1, 5, -1, 6, 0, 2, 1, 3)
    assert s.state[1] == pytest.approx([6, 5])
    assert s.state[0][0] == (1, 6)
    assert s.state[0][1] == (2, 5, 7)
    assert s.state[0][2] == (1, 7)
    for a in [2, 7, 6, 7, 1, 1, 5]:
        s = s.play(a)
    assert s.is_terminal()
    assert s.state[1] == pytest.approx([9, 13])
    assert s.payoff == pytest.approx([-4.0, 4.0])
def test_goofspiel():
    g = Goofspiel(4, scoring=Goofspiel.Scoring.ZEROSUM)
    mc = OutcomeMCCFR(g, seed=42)
    mc.compute(100)
    vs = LinearValueStore(goofspiel_feaures_cards(g.initial_state()), fix_mean=2.5)
    infosampler = InformationSetSampler(g, mc)
    val = SparseSGDLinearValueLearning(g, goofspiel_feaures_cards, vs, infosampler, seed=43)
    val.compute([mc, mc], 100, 0.1, 0.01)
    print(vs.values)
def test_goofspiel_rewards():
    us = [UniformStrategy(), UniformStrategy()]
    # ZEROSUM: either a tie or one prize each, so the score difference is 0 or +/-(100 - 11) = +/-89.
    g = Goofspiel(2, Goofspiel.Scoring.ZEROSUM, rewards=[100, 11])
    for i in range(50):
        s = play_strategies(g, us, seed=i)
        assert tuple(s.payoff) in ((0.0, 0.0), (-89.0, 89.0), (89.0, -89.0))
    # ABSOLUTE: each player keeps the raw reward of the prize they won.
    g = Goofspiel(2, Goofspiel.Scoring.ABSOLUTE, rewards=[100, 11])
    for i in range(50):
        s = play_strategies(g, us, seed=i)
        assert tuple(s.payoff) in ((0.0, 0.0), (100.0, 11.0), (11.0, 100.0))
def main():
    N = 4
    ITERS = 2000000
    g = Goofspiel(N, scoring=Goofspiel.Scoring.ZEROSUM)
    mc = OutcomeMCCFR(g, seed=56)
    fname = "goof-{}".format(N)
    its = 1024
    while its < ITERS:
        cached = mc.persist(fname, iterations=its)
        if not cached:
            print("Exploitability after {:7d} turns (mc, g): {}, {}".format(
                its, exploitability(g, 0, mc), exploitability(g, 1, mc)))
        its *= 2
    infosampler = InformationSetSampler(g, mc)
    vsts = (1, 3)
    gsts = (1, 3)
    ax0 = plt.subplot(len(vsts), len(gsts), 1)
    for i, (vst, gst) in enumerate(itertools.product(vsts, gsts)):
        vs = LinearValueStore(goofspiel_feaures_cards(g.initial_state()),
                              fix_mean=(N + 1) / 2.0)
        vl = SparseSGDLinearValueLearning(g, goofspiel_feaures_cards, vs, infosampler, seed=44)
        vals = np.concatenate([
            vl.compute([mc, mc], 1000, step=s, record_every=1,
                       val_samples=vst, grad_samples=gst)
            for s in [2**-8, 2**-9, 2**-10, 2**-11]
        ], axis=0)
        #c = ['red', 'green', 'blue', 'black'][i]
        ax = plt.subplot(len(vsts), len(gsts), i + 1, sharex=ax0, sharey=ax0)
        ax.plot(vals)
        ax.legend(list(range(1, N + 1)))
        ax.set_title("valseps={} gradsteps={}".format(vst, gst))
        print("Done sampling valseps={} gradsteps={}".format(vst, gst))
        print("Values:", vs.values)
    plt.show()
    return
    # Unreachable below the return: re-solve the game with the learned card values as rewards.
    g2 = Goofspiel(N, scoring=Goofspiel.Scoring.ZEROSUM, rewards=vs.values)
    mc2 = OutcomeMCCFR(g2, seed=57)
    mc2.compute(iterations=ITERS)
    print("Exp(mc2, g2)", exploitability(g2, 0, mc2), exploitability(g2, 1, mc2))
    print("Exp(mc2, g)", exploitability(g, 0, mc2), exploitability(g, 1, mc2))
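If this experiment is meant to run as a standalone script, a minimal entry-point sketch (assuming the gamegym classes used above, plus numpy as np, matplotlib.pyplot as plt, and itertools, are imported at module level):

# Hypothetical entry point for running the experiment above directly;
# assumes the module-level imports mentioned in the lead-in are present.
if __name__ == "__main__":
    main()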
def test_mccfr_goofspiel3():
    g = Goofspiel(3, scoring=Goofspiel.Scoring.ZEROSUM)
    mc = OutcomeMCCFR(g, seed=51)
    mc.compute(600, burn=0.5)
    mcs = mc.strategies
    us = UniformStrategy()
    s1 = g.play_sequence([2])
    assert mcs[0].strategy(s1) == pytest.approx([0., 0.9, 0.], abs=0.1)
    assert sample_payoff(g, mcs, 300, seed=12)[0] == pytest.approx([0.0, 0.0], abs=0.1)
    assert sample_payoff(g, (mcs[0], us), 300, seed=13)[0] == pytest.approx([1.2, -1.2], abs=0.2)
    assert exploitability(g, 0, mcs[0]) < 0.1
    assert exploitability(g, 1, mcs[1]) < 0.1
def test_dump_gambit_game():
    g = Goofspiel(3, scoring=Goofspiel.Scoring.ZEROSUM)
    s = io.StringIO()
    write_efg(g, s, names=False)
    assert len(s.getvalue()) > 1024
    s = io.StringIO()
    write_efg(g, s, names=True)
    assert len(s.getvalue()) > 1024
    g2 = Goofspiel(2, scoring=Goofspiel.Scoring.WINLOSS)
    s = io.StringIO()
    write_efg(g2, s, names=True)
    assert len(s.getvalue().splitlines()) == 40
def test_unit():
    g = Goofspiel(4, scoring=Goofspiel.Scoring.ZEROSUM)
    mc = OutcomeMCCFR(g, seed=42)
    mc.compute(500)
    vs = GoofSpielCardsValueStore(g)
    val = SparseStochasticValueLearning(g, vs, seed=43)
    val.compute([mc, mc], 200, 0.01)
def xtest_server():
    # The "xtest_" prefix keeps pytest from collecting this manual check.
    from gamegym.games import Goofspiel
    from gamegym.strategy import UniformStrategy
    g = Goofspiel(5)
    s = Server()
    s.play_game(g, [None, UniformStrategy()])
def test_approx_best_response_goofspiel():
    for n_cards, its, br_value in [(3, 1000, 1.333), (4, 20000, 2.5)]:
        game = Goofspiel(n_cards, Goofspiel.Scoring.ZEROSUM)
        strategy = ApproxBestResponse(game, 0, [UniformStrategy()] * 2, iterations=its, seed=35)
        assert strategy.sample_value(its // 2) == pytest.approx(br_value, rel=0.2)
def test_best_response_goofspiel():
    for n_cards, br_value in [(3, pytest.approx(4/3)), (4, pytest.approx(2.5))]:
        game = Goofspiel(n_cards, Goofspiel.Scoring.ZEROSUM)
        strategy = BestResponse(game, 0, {1: UniformStrategy()})
        for k, v in strategy.best_responses.items():
            reward = k[1][-1]
            assert reward not in v.values() or v.probability(reward) == 1.0
        assert strategy.value == br_value
def non_test_goofspiel():
    g = Goofspiel(4, scoring=Goofspiel.Scoring.ZEROSUM)
    mc = OutcomeMCCFR(g, seed=42)
    for s in [10, 100, 1000]:
        mc.compute(s)
        br = BestResponse(g, 0, [None, mc])
        print(
            "Exploit after", s,
            np.mean([
                g.play_strategies([br, mc], seed=i)[-1].values()[0]
                for i in range(1000)
            ]))
    vs = GoofSpielCardsValueStore(g)
    val = SparseStochasticValueLearning(g, vs, seed=43)
    for alpha in [0.1, 0.01, 0.01, 0.001, 0.0001]:
        print(alpha)
        val.compute([mc, mc], 200, alpha)
def main():
    g = MatchingPennies()
    base = np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
    plot_to_files(g, "plot_mccfr_trace_pennies", 3, 1500, 150, base=base, exploit_every=1)

    g = RockPaperScissors()
    plot_to_files(g, "plot_mccfr_trace_rps", 3, 1500, 150, burn=0.3, exploit_every=1)

    g = Goofspiel(4, scoring=Goofspiel.Scoring.ZEROSUM)
    plot_to_files(
        g, "plot_mccfr_trace_goof4", 6, 1000000, 1000,
        depth=6, burn=0.3, burn_from=3, exploit_every=1)

    g = Goofspiel(5, scoring=Goofspiel.Scoring.ZEROSUM)
    plot_to_files(
        g, "plot_mccfr_trace_goof5", 6, 1000000, 1000,
        depth=6, burn=0.3, burn_from=3, exploit_every=10)

    g = DicePoker(6)
    plot_to_files(
        g, "plot_mccfr_trace_dicepoker", 6, 500000, 500,
        depth=6, burn=0.3, burn_from=3, exploit_every=1)
def test_mccfr_goofspiel4():
    g = Goofspiel(4, scoring=Goofspiel.Scoring.ZEROSUM)
    mc = OutcomeMCCFR(g, seed=49)
    mc.compute(10000, burn=0.5)
    mcs = mc.strategies
    for p in [0, 1]:
        exp = exploitability(g, p, mcs[p])
        aexp = approx_exploitability(g, p, mcs[p], 10000, seed=31 + p)
        print(p, exp, aexp)
        assert exp == pytest.approx(0.7, abs=0.2)
        assert aexp == pytest.approx(0.7, abs=0.2)
def test_goofspeil():
    g = Goofspiel(7)
    s = g.initial_state()
    assert s.player() == s.P_CHANCE
    assert s.score(0) == 0
    assert s.score(1) == 0
    assert s.actions() == list(range(7))
    assert (s.chance_distribution().probabilities() ==
            (pytest.approx(1 / 7),) * 7).all()
    for i, a in enumerate([3, 1, 0, 4, 3, 5, 5, 2, 2, 1, 4, 3, 2, 6]):
        s = s.play(a)
        assert s.player() == (i + 1) % 3 - 1
    assert s.round() == 4
    assert s.player() == 1
    assert s.actions() == [1, 4, 6]
    assert s.winners() == [0, 1, -1, 0]
    assert (s.chance_distribution().probabilities() ==
            (pytest.approx(1.0 / 3),) * 3).all()
    assert s.score(0) == 6
    assert s.score(1) == 5
    assert s.cards_in_hand(-1) == [0, 6]
    assert s.cards_in_hand(0) == [0, 5]
    assert s.cards_in_hand(1) == [1, 4, 6]
    for a in [1, 6, 5, 6, 0, 0, 0]:
        s = s.play(a)
    assert s.is_terminal()
    assert s.score(0) == 9
    assert s.score(1) == 12
    assert s.values() == (-1, 1)
def test_best_response_goofspiel():
    for n_cards, br_value in [(3, pytest.approx(4 / 3)), (4, pytest.approx(2.5))]:
        game = Goofspiel(n_cards, Goofspiel.Scoring.ZEROSUM)
        strategy = BestResponse(game, 0, [UniformStrategy()] * 2)
        for k, v in strategy.best_responses.items():
            reward = k[-1]
            played_cards = k[0::3]
            idx = len([
                i for i in range(n_cards)
                if i < reward and i not in played_cards
            ])
            assert reward in played_cards or v[idx] == 1.0
        assert strategy.value == br_value
def main():
    print("#### Rock-paper-scissors value estimation")
    g = RockPaperScissors()
    us = UniformStrategy()
    infosampler = InformationSetSampler(g, us)
    val = LPZeroSumValueLearning(g, infosampler, matrix_zerosum_features, us)
    # Regularize: set one payoff to 1.0
    val.add_condition({(0, 1): 1.0}, 1.0)
    print("# With only non-triviality (one payoff set to 1.0)")
    print(val.compute())
    print("Flex value sum", val.flex_sum)
    # Zero diagonal
    for i in range(3):
        val.add_condition({(i, i): 1.0}, 0.0)
    print("# With zero diagonal")
    print(val.compute())
    print("Flex value sum", val.flex_sum)
    # Symmetrical payoffs
    for i in range(3):
        for j in range(i):
            val.add_condition({(i, j): -1.0, (j, i): -1.0}, 0.0)
    print("# Adding val(i,j) = -val(j,i)")
    print(val.compute())
    print("Flex value sum", val.flex_sum)
    #return

    ### Goofspiel(3) is boring, Goofspiel(4) hits OOM
    print("#### Goofspiel(4) card value estimation")
    g = Goofspiel(4)
    mc = OutcomeMCCFR(g, seed=42)
    mc.compute(2000)
    ef = InfoSetExpectedFeatures(g, goofspiel_feaures_cards, mc)
    for i, f in ef.info_features.items():
        print("INFOSET {}:\n{}".format(i, f))
        print(ef.info_next[i])
    return
    # Unreachable below the return: LP estimation of the card values.
    val = LPZeroSumValueLearning(g, infosampler, goofspiel_feaures_cards, mc)
    # Regularize: constrain the card values to sum to 10.0 (mean 2.5)
    val.add_condition({(0, ): 1.0, (1, ): 1.0, (2, ): 1.0, (3, ): 1.0}, 10.0)
    print("# Regularizing card values mean to 2.5 (mean of 1..4)")
    print(len(val.conds_eq), len(val.conds_le), len(val.flex_variables))
    print(val.compute(
        options=dict(tol=1e-6, disp=True, sparse=True, lstsq=True)))
    print("Flex value sum", val.flex_sum)
def test_parse_gambit_strategy_g3():
    g = Goofspiel(3, scoring=Goofspiel.Scoring.ZEROSUM)
    txt = "NE,1,0,0,1,0,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,0,0,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1"
    strats = parse_strategy(g, txt)
    # Check each player's parsed equilibrium strategy against its own exploitability.
    assert exploitability(g, 0, strats[0]) < 1e-6
    assert exploitability(g, 1, strats[1]) < 1e-6
def test_best_response_limit():
    game = Goofspiel(3)
    # An unrestricted traversal succeeds ...
    BestResponse(game, 0, [UniformStrategy()] * 2)
    # ... while a 1024-node limit is exceeded on the same game tree.
    with pytest.raises(LimitExceeded, match="traversed more than"):
        BestResponse(game, 0, [UniformStrategy()] * 2, max_nodes=1024)
def test_goofspeil_rewards():
    g = Goofspiel(2, Goofspiel.Scoring.ZEROSUM, rewards=[100, 11])
    for _ in range(10):
        history = g.play_strategies([UniformStrategy(), UniformStrategy()])
        t = history[-1]
        # Either a tie or one prize each: score difference is 0 or +/-(100 - 11) = +/-89.
        assert t.values() in ([0, 0], [-89, 89], [89, -89])
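The test functions above are written for pytest. A minimal way to run just the Goofspiel-related ones programmatically (equivalent to `pytest -q -k goof` on the command line; assumes pytest is installed and the tests are on its collection path):

# Hypothetical runner: collect and run only tests whose names contain "goof", quietly.
import pytest

raise SystemExit(pytest.main(["-q", "-k", "goof"]))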