Example #1
def generate(agent, worlds):
    buffer = []
    while True:
        with torch.no_grad():
            decisions = agent(worlds, value=True)
        new_worlds, transition = worlds.step(decisions.actions)

        buffer.append(
            arrdict.arrdict(obs=worlds.obs,
                            seats=worlds.seats,
                            v=decisions.v,
                            terminal=transition.terminal,
                            rewards=transition.rewards).detach())

        # Waiting until the buffer holds more than boardsize**2 steps guarantees the oldest step's trajectory has terminated
        if len(buffer) > worlds.boardsize**2:
            buffer = buffer[1:]
            chunk = arrdict.stack(buffer)
            terminal = torch.stack(
                [chunk.terminal for _ in range(worlds.n_seats)], -1)
            targets = reward_to_go(chunk.rewards.float(), chunk.v.float(),
                                   terminal)

            yield chunk.obs[0], chunk.seats[0], targets[0]
        else:
            if len(buffer) % worlds.boardsize == 0:
                log.info(f'Experience: {len(buffer)}/{worlds.boardsize**2}')

        worlds = new_worlds
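
A minimal consumption sketch for the generator above; `network`, `training_step`, and the construction of `worlds` and `agent` are assumptions, not part of the example:

# Hypothetical usage sketch: each yielded item is the oldest buffered step's
# observations and seats together with its reward-to-go targets. All names
# here are assumed for illustration.
for obs, seats, targets in generate(agent, worlds):
    loss = training_step(network, obs, seats, targets)  # hypothetical update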
Example #2
def evaluate(run, idx, max_games=1024, target_std=.025):
    """
    Memory usage:
        * 3b1w2d: 1.9G
        * 9b4096w1d: 2.5G
    """

    worlds = common.worlds(run, 2)
    agent = common.agent(run, idx)
    arena = mohex.CumulativeArena(worlds)

    name = 'latest' if idx is None else f'snapshot.{idx}'

    start = time.time()
    trace = []
    while True:
        soln, results = arena.play(agent)
        trace.append(soln)
        if soln.std < target_std:
            break
        if soln.games >= max_games:
            break

        rate = (time.time() - start)/(soln.games + 1e-6)
        log.info(f'{rate:.0f}s per game; {rate*soln.games:.0f}s so far, {rate*max_games:.0f}s expected')

        database.save(run, rename(results, name))

    return arrdict.stack(trace), results
Example #3
def _solve(n, w, soln=None, max_iter=100, tol=1e-9, **kwargs):
    n = torch.as_tensor(n)
    w = torch.as_tensor(w)
    #TODO: Find a better way of converting everything to double
    elbo = ELBO(n.size(0)).double()

    if soln is not None:
        elbo.μ.data[:] = torch.as_tensor(soln.μ)
        elbo.Σ = torch.as_tensor(soln.Σ)

    # The gradients around here can be a little explode-y; a line search is a bit slow but
    # keeps us from falling off any cliffs.
    optim = torch.optim.LBFGS(
        elbo.parameters(), 
        line_search_fn='strong_wolfe', 
        tolerance_change=tol,
        max_iter=max_iter,
        **kwargs)

    trace = []

    def closure():
        l = -elbo(n, w)
        if torch.isnan(l):
            raise ValueError('Hit a nan.')
        optim.zero_grad()
        l.backward()

        grads = [p.grad for p in elbo.parameters()]
        paramnorm = torch.cat([p.data.flatten() for p in elbo.parameters()]).pow(2).mean().pow(.5)
        gradnorm = torch.cat([g.flatten() for g in grads]).pow(2).mean().pow(.5)
        relnorm = gradnorm/paramnorm

        trace.append(arrdict.arrdict(
            l=l,
            gradnorm=gradnorm,
            relnorm=relnorm,
            Σ=elbo.Σ).detach().clone())

        return l

    try:
        optim.step(closure)
        closure()
    except ValueError as e:
        log.warning(f'activelo did not converge: "{e}"')

    μd, σ2d = map(as_square, pairwise_diffs(elbo.μ, elbo.Σ))
    return arrdict.arrdict(
        n=n,
        w=w,
        μ=elbo.μ, 
        Σ=elbo.Σ, 
        μd=μd,
        σd=σ2d**.5,
        trace=arrdict.stack(trace)).detach().numpy()
Example #4
def as_chunk(buffer, batch_size):
    chunk = arrdict.stack(buffer)
    terminal = torch.stack(
        [chunk.transitions.terminal for _ in range(chunk.worlds.n_seats)], -1)
    chunk['reward_to_go'] = learning.reward_to_go(
        chunk.transitions.rewards.float(), chunk.decisions.v.float(),
        terminal).half()

    n_new = batch_size // terminal.size(1)
    chunk_stats(chunk, n_new)

    buffer = buffer[n_new:]

    return chunk, buffer
Example #5
def random_empty_positions(geometries, n_agents, n_points):
    points = []
    for g in geometries:
        sample = np.stack((g.masks > 0).nonzero(), -1)

        # There might be fewer open points than we're asking for
        n_possible = min(len(sample) // n_agents, n_points)
        sample = sample[np.random.choice(np.arange(len(sample)),
                                         (n_possible, n_agents),
                                         replace=True)]

        # So repeat the sample until we've got enough
        sample = np.concatenate([sample] *
                                int(n_points / len(sample) + 1))[-n_points:]
        sample = np.random.permutation(sample)
        points.append(
            geometry.centers(sample, g.masks.shape, g.res).transpose(1, 0, 2))
    return arrdict.stack(points)
Example #6
def combine_decisions(dtrace, mtrace):
    agents = {a for d in dtrace for a in d}
    n_envs = next(iter(mtrace[0].values())).size(0)
    results = arrdict.arrdict()
    for a in agents:
        exemplar = [d[a] for d in dtrace if a in d][0]
        device = next(iter(arrdict.leaves(exemplar))).device

        a_results = []
        for d, m in zip(dtrace, mtrace):
            expanded = exemplar.map(expand, n_envs=n_envs)
            if a in m:
                expanded[m[a]] = d[a]
                expanded['mask'] = m[a]
            else:
                expanded['mask'] = torch.zeros((n_envs,), dtype=bool, device=device)
            a_results.append(expanded)
        results[str(a)] = arrdict.stack(a_results)
    return results
Example #7
    def __init__(self, world, n_nodes=64, c_puct=1/16, noise_eps=.25, alpha_scale=10):
        """
        c_puct high: concentrates on prior
        c_puct low: concentrates on value
        """
        self.device = world.device
        self.n_envs = world.n_envs
        self.n_nodes = n_nodes
        self.n_seats = world.n_seats
        assert n_nodes > 1, 'MCTS requires at least two nodes'

        self.envs = torch.arange(world.n_envs, device=self.device)

        self.n_actions = np.prod(world.action_space)
        self.tree = arrdict.arrdict(
            children=self.envs.new_full((world.n_envs, self.n_nodes, self.n_actions), -1, dtype=torch.short),
            parents=self.envs.new_full((world.n_envs, self.n_nodes), -1, dtype=torch.short),
            relation=self.envs.new_full((world.n_envs, self.n_nodes), -1, dtype=torch.short))

        self.worlds = arrdict.stack([world for _ in range(self.n_nodes)], 1)
        
        self.transitions = arrdict.arrdict(
            rewards=torch.full((world.n_envs, self.n_nodes, self.n_seats), 0., device=self.device, dtype=torch.half),
            terminal=torch.full((world.n_envs, self.n_nodes), False, device=self.device, dtype=torch.bool))

        self.decisions = arrdict.arrdict(
            logits=torch.full((world.n_envs, self.n_nodes, self.n_actions), np.nan, device=self.device, dtype=torch.half),
            v=torch.full((world.n_envs, self.n_nodes, self.n_seats), np.nan, device=self.device, dtype=torch.half))

        self.stats = arrdict.arrdict(
            n=torch.full((world.n_envs, self.n_nodes), 0, device=self.device, dtype=torch.short),
            w=torch.full((world.n_envs, self.n_nodes, self.n_seats), 0., device=self.device, dtype=torch.half))

        self.sim = torch.tensor(0, device=self.device, dtype=torch.long)
        self.worlds[:, 0] = world

        # https://github.com/LeelaChessZero/lc0/issues/694
        # Larger c_puct -> greater regularization
        self.c_puct = torch.full((world.n_envs,), c_puct, device=self.device, dtype=torch.half)

        self.noise_eps = noise_eps
        self.alpha_scale = alpha_scale
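
For reference, the docstring's description of c_puct matches its role in the standard AlphaZero-style PUCT score, where a large constant weights the prior and a small one weights the accumulated value. This is a sketch for orientation only, not a claim about the exact selection rule this class uses:

import math

# Reference sketch of the AlphaZero-style PUCT score. Larger c_puct weights the
# prior more heavily; smaller c_puct weights the observed value more heavily.
def puct_score(q, prior, n_parent, n_child, c_puct):
    return q + c_puct * prior * math.sqrt(n_parent) / (1 + n_child)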
Example #8
def benchmark():
    import pickle
    with open('output/descent/hex.pkl', 'rb') as f:
        data = pickle.load(f)
        data['c_puct'] = torch.repeat_interleave(data.c_puct[:, None],
                                                 data.logits.shape[1], 1)
        data = data.cuda()

    results = []
    with aljpy.timer() as timer:
        torch.cuda.synchronize()
        for t in range(data.logits.shape[0]):
            m = cuda.mcts(**data[t])
            results.append(cuda.descend(m))
        torch.cuda.synchronize()
    results = arrdict.stack(results)
    time = timer.time()
    samples = results.parents.nelement()
    print(f'{1000*time:.0f}ms total, {1e9*time/samples:.0f}ns/descent')

    return results
Example #9
def rollout(worlds, agents, n_steps=None, n_trajs=None, n_reps=None, **kwargs):
    assert sum(x is not None for x in (n_steps, n_trajs, n_reps)) == 1, 'Must specify exactly one of n_steps or n_trajs or n_reps'

    trace, dtrace, mtrace = [], [], []
    steps, trajs = 0, 0
    reps = torch.zeros(worlds.n_envs, device=worlds.device)
    while True:
        decisions, masks = {}, {}
        for i, agent in enumerate(agents):
            mask = worlds.seats == i
            if mask.any():
                decisions[i] = agent(worlds[mask], **kwargs)
                masks[i] = mask

        actions = combine_actions(decisions, masks)
        
        worlds, transitions = worlds.step(actions)
        trace.append(arrdict.arrdict(
            actions=actions,
            transitions=transitions,
            worlds=worlds))
        
        mtrace.append(masks)
        dtrace.append(decisions)

        steps += 1

        if n_steps and (steps >= n_steps):
            break
        trajs += transitions.terminal.sum()
        if n_trajs and (trajs >= n_trajs):
            break
        reps += transitions.terminal
        if n_reps and (reps >= n_reps).all():
            break

    trace = arrdict.stack(trace)
    trace['decisions'] = combine_decisions(dtrace, mtrace)

    return trace
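
A hedged usage sketch; `worlds`, `agent_a`, and `agent_b` are assumed to exist with the interface the function expects and are not defined in this example:

# Hypothetical usage: play until 256 trajectories have finished, then count the
# terminal steps recorded in the trace. All names here are assumptions.
trace = rollout(worlds, [agent_a, agent_b], n_trajs=256)
finished = trace.transitions.terminal.sum()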
Example #10
def random_empty_positions(geometries, n_agents, n_points):
    """Returns a tensor of randomly-selected empty points in each :ref:`geometry <geometry>`.
    
    The returned tensor is a (n_geometries, n_agents, n_points, 2)-float tensor, with the coordinates given in meters.

    This is typically used when you want to randomly move an agent to a new place, but *finding* an empty point at
    each timestep is too expensive. So instead this is used to generate ``n_points`` empty points in advance, and then
    when you need one you can choose from the pre-generated options.
    """ 
    points = []
    for g in geometries:
        sample = np.stack((g.masks > 0).nonzero(), -1)

        # There might be fewer open points than we're asking for
        n_possible = min(len(sample)//n_agents, n_points)
        sample = sample[np.random.choice(np.arange(len(sample)), (n_possible, n_agents), replace=True)]

        # So repeat the sample until we've got enough
        sample = np.concatenate([sample]*int(n_points/len(sample)+1))[-n_points:]
        sample = np.random.permutation(sample)
        points.append(geometry.centers(sample, g.masks.shape, g.res).transpose(1, 0, 2))
    return arrdict.stack(points)
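
A minimal usage sketch of the pre-generation pattern the docstring describes; `geometries` and the chosen sizes are assumptions for illustration:

# Hypothetical usage: generate a pool of empty points once, then index into it
# whenever an agent needs a new position.
points = random_empty_positions(geometries, n_agents=4, n_points=1024)
idx = np.random.randint(points.shape[2])
new_positions = points[:, :, idx]  # one pre-generated empty point per geometry and agent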
Example #11
def as_chunk(buffer):
    chunk = arrdict.stack(buffer)
    with stats.defer():
        stats.rate('sample-rate/actor', chunk.world.reset.nelement())
        stats.mean('traj-length', chunk.world.reset.nelement(),
                   chunk.world.reset.sum())
        stats.cumsum('count/traj', chunk.world.reset.sum())
        stats.cumsum('count/world', chunk.world.reset.size(0))
        stats.cumsum('count/chunks', 1)
        stats.cumsum('count/samples', chunk.world.reset.nelement())
        stats.rate('step-rate/chunks', 1)
        stats.rate('step-rate/world', chunk.world.reset.size(0))
        stats.mean('step-reward', chunk.world.reward.sum(),
                   chunk.world.reward.nelement())
        stats.mean('traj-reward/mean', chunk.world.reward.sum(),
                   chunk.world.reset.sum())
        stats.mean('traj-reward/positive',
                   chunk.world.reward.clamp(0, None).sum(),
                   chunk.world.reset.sum())
        stats.mean('traj-reward/negative',
                   chunk.world.reward.clamp(None, 0).sum(),
                   chunk.world.reset.sum())
    return chunk
Example #12
def simulate(truth, n_games=256, σresid_tol=.1):
    n_agents = len(truth)
    wins = torch.zeros((n_agents, n_agents))
    games = torch.zeros((n_agents, n_agents))

    trace = []
    ranks = torch.full((n_agents, ), 0.)
    while True:
        soln = activelo.solve(games, wins)
        ranks = torch.as_tensor(soln.μ)

        black, white = activelo.suggest(soln)
        black_wins = torch.distributions.Binomial(
            n_games, winrate(truth[black], truth[white])).sample()
        wins[black, white] += black_wins
        wins[white, black] += n_games - black_wins
        games[black, white] += n_games
        games[white, black] += n_games

        soln['n'] = games.clone()
        soln['w'] = wins.clone()
        soln['σresid'] = residual_vs_mean(soln.Σ).mean()**.5
        soln['resid_var'] = resid_var(ranks, truth)
        trace.append(
            arrdict.arrdict({k: v
                             for k, v in soln.items() if k != 'trace'}))

        plt.close()
        from IPython import display
        display.clear_output(wait=True)
        display.display(plot(trace, truth))
        if soln.σresid < σresid_tol:
            break

    trace = arrdict.stack(trace)

    return trace
Example #13
def gradients(network, chunk):
    grads = []
    for t in range(chunk.reward_to_go.size(0)):
        grads.append(gradient(network, chunk[t]))
    return arrdict.stack(grads)
Example #14
def adam_over_time(run, B):
    import matplotlib.pyplot as plt
    from tqdm.auto import tqdm
    sizes = arrdict.stack(
        [adam_way(run, idx, B) for idx in tqdm(storage.snapshots(run))])
    plt.plot(sizes)