def generate(agent, worlds):
    buffer = []
    while True:
        with torch.no_grad():
            decisions = agent(worlds, value=True)
        new_worlds, transition = worlds.step(decisions.actions)

        buffer.append(arrdict.arrdict(
            obs=worlds.obs,
            seats=worlds.seats,
            v=decisions.v,
            terminal=transition.terminal,
            rewards=transition.rewards).detach())

        # Waiting until the buffer is boardsize**2 steps deep guarantees every trajectory in it has terminated
        if len(buffer) > worlds.boardsize**2:
            buffer = buffer[1:]
            chunk = arrdict.stack(buffer)
            terminal = torch.stack([chunk.terminal for _ in range(worlds.n_seats)], -1)
            targets = reward_to_go(
                chunk.rewards.float(),
                chunk.v.float(),
                terminal)

            yield chunk.obs[0], chunk.seats[0], targets[0]
        else:
            if len(buffer) % worlds.boardsize == 0:
                log.info(f'Experience: {len(buffer)}/{worlds.boardsize**2}')

        worlds = new_worlds
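# Hedged sketch of how generate() might be consumed by a training loop; `optimize` is a
# hypothetical stand-in for the surrounding update step, not part of the original code.
def _example_training_loop(agent, worlds, optimize, n_updates=1000):
    sampler = generate(agent, worlds)
    for _ in range(n_updates):
        obs, seats, targets = next(sampler)    # oldest fully-terminated slice of the buffer
        optimize(agent, obs, seats, targets)   # e.g. regress the agent's value head onto targets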
def evaluate(run, idx, max_games=1024, target_std=.025):
    """
    Memory usage:
     * 3b1w2d: 1.9G
     * 9b4096w1d: 2.5G
    """
    worlds = common.worlds(run, 2)
    agent = common.agent(run, idx)
    arena = mohex.CumulativeArena(worlds)

    name = 'latest' if idx is None else f'snapshot.{idx}'

    start = time.time()
    trace = []
    while True:
        soln, results = arena.play(agent)
        trace.append(soln)

        if soln.std < target_std:
            break
        if soln.games >= max_games:
            break

        rate = (time.time() - start)/(soln.games + 1e-6)
        log.info(f'{rate:.0f}s per game; {rate*soln.games:.0f}s so far, {rate*max_games:.0f}s expected')

    database.save(run, rename(results, name))

    return arrdict.stack(trace), results
def _solve(n, w, soln=None, max_iter=100, tol=1e-9, **kwargs):
    n = torch.as_tensor(n)
    w = torch.as_tensor(w)

    #TODO: Find a better way of converting everything to double
    elbo = ELBO(n.size(0)).double()
    if soln is not None:
        elbo.μ.data[:] = torch.as_tensor(soln.μ)
        elbo.Σ = torch.as_tensor(soln.Σ)

    # The gradients around here can be a little explode-y; a line search is a bit slow,
    # but it keeps us from falling off any cliffs.
    optim = torch.optim.LBFGS(
        elbo.parameters(),
        line_search_fn='strong_wolfe',
        tolerance_change=tol,
        max_iter=max_iter,
        **kwargs)

    trace = []
    def closure():
        l = -elbo(n, w)
        if torch.isnan(l):
            raise ValueError('Hit a nan.')
        optim.zero_grad()
        l.backward()

        grads = [p.grad for p in elbo.parameters()]
        paramnorm = torch.cat([p.data.flatten() for p in elbo.parameters()]).pow(2).mean().pow(.5)
        gradnorm = torch.cat([g.flatten() for g in grads]).pow(2).mean().pow(.5)
        relnorm = gradnorm/paramnorm

        trace.append(arrdict.arrdict(
            l=l,
            gradnorm=gradnorm,
            relnorm=relnorm,
            Σ=elbo.Σ).detach().clone())

        return l

    try:
        optim.step(closure)
        closure()
    except ValueError as e:
        log.warn(f'activelo did not converge: "{str(e)}"')

    μd, σ2d = map(as_square, pairwise_diffs(elbo.μ, elbo.Σ))
    return arrdict.arrdict(
        n=n, w=w,
        μ=elbo.μ, Σ=elbo.Σ,
        μd=μd, σd=σ2d**.5,
        trace=arrdict.stack(trace)).detach().numpy()
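# Hedged usage sketch of the public solver that wraps _solve, mirroring the
# activelo.solve(games, wins) call in simulate() further down; the counts are made up.
def _example_solve():
    import activelo
    games = torch.tensor([[0., 10.], [10., 0.]])   # games played between agents 0 and 1
    wins = torch.tensor([[0., 7.], [3., 0.]])      # wins of the row agent over the column agent
    soln = activelo.solve(games, wins)
    return soln.μ, soln.σd                         # posterior rating means and pairwise stddevs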
def as_chunk(buffer, batch_size):
    chunk = arrdict.stack(buffer)
    terminal = torch.stack([chunk.transitions.terminal for _ in range(chunk.worlds.n_seats)], -1)
    chunk['reward_to_go'] = learning.reward_to_go(
        chunk.transitions.rewards.float(),
        chunk.decisions.v.float(),
        terminal).half()

    n_new = batch_size // terminal.size(1)
    chunk_stats(chunk, n_new)

    buffer = buffer[n_new:]

    return chunk, buffer
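# For orientation only: a minimal single-seat sketch of the reward-to-go idea, with
# terminal masking and a bootstrap from the final value estimate. This is a simplified
# stand-in, not the project's learning.reward_to_go, which also handles multiple seats.
def _reward_to_go_sketch(rewards, v, terminal):
    # rewards, v, terminal: (T, n_envs) tensors
    targets = torch.zeros_like(rewards)
    running = v[-1]                                   # bootstrap for trajectories cut off mid-episode
    for t in reversed(range(rewards.size(0))):
        running = torch.where(terminal[t], torch.zeros_like(running), running)
        running = rewards[t] + running                # undiscounted return from t onwards
        targets[t] = running
    return targets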
def combine_decisions(dtrace, mtrace):
    agents = {a for d in dtrace for a in d}
    n_envs = next(iter(mtrace[0].values())).size(0)
    results = arrdict.arrdict()
    for a in agents:
        exemplar = [d[a] for d in dtrace if a in d][0]
        device = next(iter(arrdict.leaves(exemplar))).device

        a_results = []
        for d, m in zip(dtrace, mtrace):
            expanded = exemplar.map(expand, n_envs=n_envs)
            if a in m:
                expanded[m[a]] = d[a]
                expanded['mask'] = m[a]
            else:
                expanded['mask'] = torch.zeros((n_envs,), dtype=bool, device=device)
            a_results.append(expanded)

        results[str(a)] = arrdict.stack(a_results)
    return results
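# Toy illustration of the scatter-and-mask scheme above, using plain tensors rather than
# arrdicts; the names and numbers are illustrative only.
def _example_scatter():
    n_envs = 4
    mask = torch.tensor([True, False, True, False])   # envs where this agent acted
    values = torch.tensor([.3, -.1])                  # one output per masked env
    expanded = torch.zeros(n_envs)                    # full-sized, exemplar-shaped buffer
    expanded[mask] = values                           # scatter the agent's outputs back in
    return expanded, mask                             # expanded[mask] recovers the valid entries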
def __init__(self, world, n_nodes=64, c_puct=1/16, noise_eps=.25, alpha_scale=10):
    """
    c_puct high: concentrates on prior
    c_puct low: concentrates on value
    """
    self.device = world.device
    self.n_envs = world.n_envs
    self.n_nodes = n_nodes
    self.n_seats = world.n_seats
    assert n_nodes > 1, 'MCTS requires at least two nodes'

    self.envs = torch.arange(world.n_envs, device=self.device)

    self.n_actions = np.prod(world.action_space)
    self.tree = arrdict.arrdict(
        children=self.envs.new_full((world.n_envs, self.n_nodes, self.n_actions), -1, dtype=torch.short),
        parents=self.envs.new_full((world.n_envs, self.n_nodes), -1, dtype=torch.short),
        relation=self.envs.new_full((world.n_envs, self.n_nodes), -1, dtype=torch.short))

    self.worlds = arrdict.stack([world for _ in range(self.n_nodes)], 1)

    self.transitions = arrdict.arrdict(
        rewards=torch.full((world.n_envs, self.n_nodes, self.n_seats), 0., device=self.device, dtype=torch.half),
        terminal=torch.full((world.n_envs, self.n_nodes), False, device=self.device, dtype=torch.bool))

    self.decisions = arrdict.arrdict(
        logits=torch.full((world.n_envs, self.n_nodes, self.n_actions), np.nan, device=self.device, dtype=torch.half),
        v=torch.full((world.n_envs, self.n_nodes, self.n_seats), np.nan, device=self.device, dtype=torch.half))

    self.stats = arrdict.arrdict(
        n=torch.full((world.n_envs, self.n_nodes), 0, device=self.device, dtype=torch.short),
        w=torch.full((world.n_envs, self.n_nodes, self.n_seats), 0., device=self.device, dtype=torch.half))

    self.sim = torch.tensor(0, device=self.device, dtype=torch.long)
    self.worlds[:, 0] = world

    # https://github.com/LeelaChessZero/lc0/issues/694
    # Larger c_puct -> greater regularization
    self.c_puct = torch.full((world.n_envs,), c_puct, device=self.device, dtype=torch.half)

    self.noise_eps = noise_eps
    self.alpha_scale = alpha_scale
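# For reference: the trade-off described in the docstring matches the standard
# AlphaZero-style PUCT rule sketched below; the actual selection used by this project's
# descent kernel may differ in detail (see the lc0 issue linked above).
def _puct_sketch(q, prior, n_parent, n_child, c_puct):
    # q: mean value of the child; prior: policy probability of the action.
    # High c_puct lets the prior-driven exploration term dominate; low c_puct
    # lets the value estimate q dominate.
    return q + c_puct * prior * (n_parent ** 0.5) / (1 + n_child)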
def benchmark():
    import pickle
    with open('output/descent/hex.pkl', 'rb') as f:
        data = pickle.load(f)
    data['c_puct'] = torch.repeat_interleave(data.c_puct[:, None], data.logits.shape[1], 1)
    data = data.cuda()

    results = []
    with aljpy.timer() as timer:
        torch.cuda.synchronize()
        for t in range(data.logits.shape[0]):
            m = cuda.mcts(**data[t])
            results.append(cuda.descend(m))
        torch.cuda.synchronize()
    results = arrdict.stack(results)

    time = timer.time()
    samples = results.parents.nelement()
    print(f'{1000*time:.0f}ms total, {1e9*time/samples:.0f}ns/descent')

    return results
def rollout(worlds, agents, n_steps=None, n_trajs=None, n_reps=None, **kwargs):
    assert sum(x is not None for x in (n_steps, n_trajs, n_reps)) == 1, 'Must specify exactly one of n_steps or n_trajs or n_reps'

    trace, dtrace, mtrace = [], [], []
    steps, trajs = 0, 0
    reps = torch.zeros(worlds.n_envs, device=worlds.device)
    while True:
        decisions, masks = {}, {}
        for i, agent in enumerate(agents):
            mask = worlds.seats == i
            if mask.any():
                decisions[i] = agent(worlds[mask], **kwargs)
                masks[i] = mask
        actions = combine_actions(decisions, masks)

        worlds, transitions = worlds.step(actions)
        trace.append(arrdict.arrdict(
            actions=actions,
            transitions=transitions,
            worlds=worlds))
        mtrace.append(masks)
        dtrace.append(decisions)

        steps += 1
        if n_steps and (steps >= n_steps):
            break
        trajs += transitions.terminal.sum()
        if n_trajs and (trajs >= n_trajs):
            break
        reps += transitions.terminal
        if n_reps and (reps >= n_reps).all():
            break

    trace = arrdict.stack(trace)
    trace['decisions'] = combine_decisions(dtrace, mtrace)

    return trace
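# Hedged usage sketch: pit two agents against each other until 64 games have finished.
# `worlds`, `agent_a` and `agent_b` are stand-ins constructed by the calling code.
def _example_rollout(worlds, agent_a, agent_b):
    trace = rollout(worlds, [agent_a, agent_b], n_trajs=64)
    # one way to tally results: sum the positive rewards per env and seat
    return trace.transitions.rewards.clamp(0, None).sum(0)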
def random_empty_positions(geometries, n_agents, n_points):
    """Returns a tensor of randomly-selected empty points in each :ref:`geometry <geometry>`.

    The returned tensor is a (n_geometries, n_agents, n_points, 2) float tensor, with the
    coordinates given in meters.

    This is typically used when you want to randomly move an agent to a new place, but *finding*
    an empty point at each timestep is too expensive. Instead, this generates ``n_points`` empty
    points in advance, and when you need one you can choose from the pre-generated options.
    """
    points = []
    for g in geometries:
        sample = np.stack((g.masks > 0).nonzero(), -1)

        # There might be fewer open points than we're asking for
        n_possible = min(len(sample)//n_agents, n_points)
        sample = sample[np.random.choice(np.arange(len(sample)), (n_possible, n_agents), replace=True)]

        # So repeat the sample until we've got enough
        sample = np.concatenate([sample]*int(n_points/len(sample)+1))[-n_points:]
        sample = np.random.permutation(sample)
        points.append(geometry.centers(sample, g.masks.shape, g.res).transpose(1, 0, 2))

    return arrdict.stack(points)
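# Hedged usage sketch: pre-generate 100 candidate spawn points per agent, then draw one
# whenever an agent needs respawning. `geometries` is a stand-in built by the caller.
def _example_respawn_points(geometries):
    points = random_empty_positions(geometries, n_agents=4, n_points=100)  # (n_geoms, 4, 100, 2)
    idx = np.random.randint(points.shape[2])
    return points[:, :, idx]   # one (x, y) position in meters per geometry and agent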
def as_chunk(buffer):
    chunk = arrdict.stack(buffer)
    with stats.defer():
        stats.rate('sample-rate/actor', chunk.world.reset.nelement())
        stats.mean('traj-length', chunk.world.reset.nelement(), chunk.world.reset.sum())
        stats.cumsum('count/traj', chunk.world.reset.sum())
        stats.cumsum('count/world', chunk.world.reset.size(0))
        stats.cumsum('count/chunks', 1)
        stats.cumsum('count/samples', chunk.world.reset.nelement())
        stats.rate('step-rate/chunks', 1)
        stats.rate('step-rate/world', chunk.world.reset.size(0))
        stats.mean('step-reward', chunk.world.reward.sum(), chunk.world.reward.nelement())
        stats.mean('traj-reward/mean', chunk.world.reward.sum(), chunk.world.reset.sum())
        stats.mean('traj-reward/positive', chunk.world.reward.clamp(0, None).sum(), chunk.world.reset.sum())
        stats.mean('traj-reward/negative', chunk.world.reward.clamp(None, 0).sum(), chunk.world.reset.sum())
    return chunk
def simulate(truth, n_games=256, σresid_tol=.1):
    n_agents = len(truth)
    wins = torch.zeros((n_agents, n_agents))
    games = torch.zeros((n_agents, n_agents))

    trace = []
    ranks = torch.full((n_agents,), 0.)
    while True:
        soln = activelo.solve(games, wins)
        ranks = torch.as_tensor(soln.μ)
        black, white = activelo.suggest(soln)

        black_wins = torch.distributions.Binomial(n_games, winrate(truth[black], truth[white])).sample()
        wins[black, white] += black_wins
        wins[white, black] += n_games - black_wins
        games[black, white] += n_games
        games[white, black] += n_games

        soln['n'] = games.clone()
        soln['w'] = wins.clone()
        soln['σresid'] = residual_vs_mean(soln.Σ).mean()**.5
        soln['resid_var'] = resid_var(ranks, truth)
        trace.append(arrdict.arrdict({k: v for k, v in soln.items() if k != 'trace'}))

        plt.close()
        from IPython import display
        display.clear_output(wait=True)
        display.display(plot(trace, truth))

        if soln.σresid < σresid_tol:
            break

    trace = arrdict.stack(trace)
    return trace
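# A plausible definition of the winrate helper used above, assuming ratings live on a
# log-odds scale; the project's actual winrate may differ.
def _winrate_sketch(black_rating, white_rating):
    return torch.sigmoid(torch.as_tensor(black_rating - white_rating))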
def gradients(network, chunk):
    grads = []
    for t in range(chunk.reward_to_go.size(0)):
        grads.append(gradient(network, chunk[t]))
    return arrdict.stack(grads)
def adam_over_time(run, B):
    import matplotlib.pyplot as plt
    from tqdm.auto import tqdm

    sizes = arrdict.stack([adam_way(run, idx, B) for idx in tqdm(storage.snapshots(run))])
    plt.plot(sizes)