def test_strategies():
    g = RockPaperScissors()
    rng = get_rng(seed=41)
    s1 = [UniformStrategy(), UniformStrategy()]
    v1 = np.mean(
        [g.play_strategies(s1, rng=rng)[-1].values() for i in range(300)], 0)
    assert sum(v1) == pytest.approx(0.0)
    assert v1[0] == pytest.approx(0.0, abs=0.1)
    s2 = [
        FixedStrategy(Explicit({"R": 1.0, "P": 0.0, "S": 0.0})),
        FixedStrategy(Explicit({"R": 0.5, "P": 0.5, "S": 0.0})),
    ]
    v2 = np.mean(
        [g.play_strategies(s2, rng=rng)[-1].values() for i in range(300)], 0)
    assert sum(v2) == pytest.approx(0.0)
    assert v2[0] == pytest.approx(-0.5, abs=0.1)
def test_goofspiel_rewards():
    us = [UniformStrategy(), UniformStrategy()]
    g = Goofspiel(2, Goofspiel.Scoring.ZEROSUM, rewards=[100, 11])
    for i in range(50):
        s = play_strategies(g, us, seed=i)
        assert tuple(s.payoff) in ((0.0, 0.0), (-89.0, 89.0), (89.0, -89.0))
    g = Goofspiel(2, Goofspiel.Scoring.ABSOLUTE, rewards=[100, 11])
    for i in range(50):
        s = play_strategies(g, us, seed=i)
        assert tuple(s.payoff) in ((0.0, 0.0), (100.0, 11.0), (11.0, 100.0))
def test_strategies():
    g = RockPaperScissors()
    rng = get_rng(seed=41)
    s1 = [UniformStrategy(), UniformStrategy()]
    v1 = sample_payoff(g, s1, 300, rng=rng)
    assert sum(v1[0]) == pytest.approx(0.0)
    assert v1[0] == pytest.approx([0.0, 0.0], abs=0.1)
    s2 = [
        ConstStrategy((1.0, 0.0, 0.0)),
        ConstStrategy((0.5, 0.5, 0.0)),
    ]
    v2 = sample_payoff(g, s2, 300, rng=rng)
    assert sum(v2[0]) == pytest.approx(0.0)
    assert v2[0] == pytest.approx([-0.5, 0.5], abs=0.1)
def xtest_server():
    # Prefixed "xtest_" so pytest does not collect it.
    from gamegym.games import Goofspiel
    from gamegym.strategy import UniformStrategy
    g = Goofspiel(5)
    s = Server()
    s.play_game(g, [None, UniformStrategy()])
def test_best_response_goofspiel():
    for n_cards, br_value in [(3, pytest.approx(4 / 3)), (4, pytest.approx(2.5))]:
        game = Goofspiel(n_cards, Goofspiel.Scoring.ZEROSUM)
        strategy = BestResponse(game, 0, {1: UniformStrategy()})
        for k, v in strategy.best_responses.items():
            reward = k[1][-1]
            assert reward not in v.values() or v.probability(reward) == 1.0
        assert strategy.value == br_value
def test_approx_best_response_goofspiel():
    for n_cards, its, br_value in [(3, 1000, 1.333), (4, 20000, 2.5)]:
        game = Goofspiel(n_cards, Goofspiel.Scoring.ZEROSUM)
        strategy = ApproxBestResponse(
            game, 0, [UniformStrategy()] * 2, iterations=its, seed=35)
        assert strategy.sample_value(its // 2) == pytest.approx(br_value, rel=0.2)
def test_rps():
    g = RockPaperScissors()
    us = UniformStrategy()
    rng = get_rng(seed=3)
    params = rng.rand(3, 3) - 0.5
    vs = LinearValueStore(params, fix_mean=0.0, regularize_l1=6.0)
    infosampler = InformationSetSampler(g, us)
    val = SparseSGDLinearValueLearning(
        g, matrix_zerosum_features, vs, infosampler, seed=44)
    # Run SGD with progressively smaller step sizes.
    val.compute([us, us], 100, 0.1, 0.1)
    val.compute([us, us], 100, 0.01, 0.01)
    val.compute([us, us], 100, 0.001, 0.001)
def test_mccfr_goofspiel3():
    g = Goofspiel(3, scoring=Goofspiel.Scoring.ZEROSUM)
    mc = OutcomeMCCFR(g, seed=51)
    mc.compute(600, burn=0.5)
    mcs = mc.strategies
    us = UniformStrategy()
    s1 = g.play_sequence([2])
    assert mcs[0].strategy(s1) == pytest.approx([0., 0.9, 0.], abs=0.1)
    assert sample_payoff(g, mcs, 300, seed=12)[0] == pytest.approx(
        [0.0, 0.0], abs=0.1)
    assert sample_payoff(g, (mcs[0], us), 300, seed=13)[0] == pytest.approx(
        [1.2, -1.2], abs=0.2)
    assert exploitability(g, 0, mcs[0]) < 0.1
    assert exploitability(g, 1, mcs[1]) < 0.1
def test_best_response_goofspiel():
    for n_cards, br_value in [(3, pytest.approx(4 / 3)), (4, pytest.approx(2.5))]:
        game = Goofspiel(n_cards, Goofspiel.Scoring.ZEROSUM)
        strategy = BestResponse(game, 0, [UniformStrategy()] * 2)
        for k, v in strategy.best_responses.items():
            reward = k[-1]
            # Every 3rd key entry: the cards this player has already played.
            played_cards = k[0::3]
            # Index of the card matching `reward` among the cards still in hand.
            idx = len([
                i for i in range(n_cards)
                if i < reward and i not in played_cards
            ])
            assert reward in played_cards or v[idx] == 1.0
        assert strategy.value == br_value
def compute_mccfr_traces(g,
                         prefix,
                         n_traces,
                         iters,
                         steps,
                         depth=6,
                         burn=None,
                         burn_from=0,
                         add_uniform=True,
                         exploit_every=None,
                         exploit_max_nodes=1e6):
    """
    Computes independent strategy traces of MCCFR in game `g`.
    """
    traces = []
    for ti in tqdm.trange(n_traces, desc=prefix):
        name = "MCCFR run #{}".format(ti)
        if burn and ti >= burn_from:
            name += " (burn-in)"
        mc = OutcomeMCCFR(g, seed=hash(str(g)) % 2**30 + ti)
        ps = StrategyTrace(g, depth=depth, name=name)
        for i in tqdm.trange(steps, desc="MCCFR steps"):
            w = 1.0
            if burn and ti >= burn_from and i < steps * burn:
                # Ramp the update weight up from 0.03 to 1.0 over the burn-in phase.
                w = 0.03**(1.0 - float(i) / steps / burn)
            mc.compute(int(iters * (i + 1) / steps) - mc.iterations,
                       progress=False,
                       weight=w)
            exps = None
            if exploit_every is not None and (steps - i - 1) % exploit_every == 0:
                exps = [
                    exploitability(g, p, mc.strategies[p],
                                   max_nodes=exploit_max_nodes)
                    for p in range(g.players)
                ]
            ps.append(mc.iterations, mc.strategies, exps)
        traces.append(ps)
    if add_uniform:
        # Add a constant uniform-strategy trace as a baseline.
        rps = StrategyTrace(g, depth=depth, name="Uniform")
        rstrat = [UniformStrategy()] * g.players
        rexps = None
        if exploit_every is not None:
            rexps = [
                exploitability(g, p, rstrat[p], max_nodes=exploit_max_nodes)
                for p in range(g.players)
            ]
        for t in traces[0].d_t:
            rps.append(t, rstrat, rexps)
        traces.append(rps)
    return traces
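# A minimal usage sketch for compute_mccfr_traces, not from the repo: the
# game size, trace count, and iteration budget below are illustrative
# assumptions chosen to run quickly, and demo_compute_mccfr_traces is a
# hypothetical driver name.
def demo_compute_mccfr_traces():
    from gamegym.games import Goofspiel
    g = Goofspiel(4, Goofspiel.Scoring.ZEROSUM)
    traces = compute_mccfr_traces(
        g, "goofspiel4", n_traces=3, iters=10000, steps=20,
        exploit_every=5)  # evaluate exploitability every 5th step
    for trace in traces:
        print(trace.name)
    return traces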
def main():
    print("#### Rock-paper-scissors value estimation")
    g = RockPaperScissors()
    us = UniformStrategy()
    infosampler = InformationSetSampler(g, us)
    val = LPZeroSumValueLearning(g, infosampler, matrix_zerosum_features, us)
    # Regularize: set one payoff to 1.0
    val.add_condition({(0, 1): 1.0}, 1.0)
    print("# With only non-triviality (one payoff set to 1.0)")
    print(val.compute())
    print("Flex value sum", val.flex_sum)
    # Zero diagonal
    for i in range(3):
        val.add_condition({(i, i): 1.0}, 0.0)
    print("# With zero diagonal")
    print(val.compute())
    print("Flex value sum", val.flex_sum)
    # Symmetrical payoffs
    for i in range(3):
        for j in range(i):
            val.add_condition({(i, j): -1.0, (j, i): -1.0}, 0.0)
    print("# Adding val(i,j) = -val(j,i)")
    print(val.compute())
    print("Flex value sum", val.flex_sum)

    ### Goofspiel(3) is boring, Goofspiel(4) hits OOM
    print("#### Goofspiel(4) card value estimation")
    g = Goofspiel(4)
    mc = OutcomeMCCFR(g, seed=42)
    mc.compute(2000)
    ef = InfoSetExpectedFeatures(g, goofspiel_feaures_cards, mc)
    for i, f in ef.info_features.items():
        print("INFOSET {}:\n{}".format(i, f))
        print(ef.info_next[i])
    return
    # Unreachable below: kept as a sketch of LP-based card value learning.
    val = LPZeroSumValueLearning(g, infosampler, goofspiel_feaures_cards, mc)
    # Regularize: tie the mean of the card values.
    val.add_condition({(0,): 1.0, (1,): 1.0, (2,): 1.0, (3,): 1.0}, 10.0)
    print("# Regularizing card values mean to 2.5 (mean of 1..4)")
    print(len(val.conds_eq), len(val.conds_le), len(val.flex_variables))
    print(val.compute(
        options=dict(tol=1e-6, disp=True, sparse=True, lstsq=True)))
    print("Flex value sum", val.flex_sum)
def test_infoset():
    g = RockPaperScissors()
    us = UniformStrategy()
    iss = InformationSetSampler(g, [us, us])
    assert iss._player_dist.probs == pytest.approx(np.array([0.5, 0.5]))
    assert iss._infoset_dist[0].probs == pytest.approx(np.array([1.0]))
    assert iss._infoset_dist[1].probs == pytest.approx(np.array([1.0]))
    assert iss._infoset_history_dist[0][()].probs == pytest.approx(
        np.array([1.0]))
    assert iss._infoset_history_dist[1][()].probs == pytest.approx(
        np.array([1.0, 1.0, 1.0]) / 3)
    iss.sample_player()
    iss.sample_info()
    assert iss.sample_info(0)[1] == ()
    assert iss.sample_info(1)[1] == ()
    assert isinstance(iss.sample_state()[2], Situation)
    assert isinstance(iss.player_distribution(), Distribution)
    assert isinstance(iss.info_distribution(0), Distribution)
    assert isinstance(iss.state_distribution(0, ()), Distribution)
def test_goofspiel_rewards():
    g = Goofspiel(2, Goofspiel.Scoring.ZEROSUM, rewards=[100, 11])
    for _ in range(10):
        history = g.play_strategies([UniformStrategy(), UniformStrategy()])
        t = history[-1]
        assert t.values() in ([0, 0], [-89, 89], [89, -89])
def test_best_response_limit():
    game = Goofspiel(3)
    # Without a node limit the full tree traversal succeeds.
    BestResponse(game, 0, [UniformStrategy()] * 2)
    # pytest.raises takes `match=` (a regex against the exception text);
    # the old `message=` kwarg was removed in pytest 4.
    with pytest.raises(LimitExceeded, match="traversed more than"):
        BestResponse(game, 0, [UniformStrategy()] * 2, max_nodes=1024)