def test_tt():
    tt = TickTock()
    tt.profile("test0")
    sleep(.01)
    tt.profile("test1")
    sleep(.01)
    tt.end_profile("test1")
    sleep(.01)
    tt.end_profile("test0")
    assert np.isclose(0.03, tt.profiles["test0"].sum(), 1)
    assert np.isclose(0.01, tt.profiles["test1"].sum(), 1)
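# Hedged usage sketch (not part of the test suite): the pattern exercised above is nested
# profiling - profile()/end_profile() pairs may be nested as long as they are closed in
# reverse order, and each pair adds one measurement to the named profile. Assumes the same
# sleep/np imports as the test above and only TickTock methods already used in this repo.
def example_nested_profiling():
    tt = TickTock()
    tt.profile("outer")          # start the outer measurement
    for _ in range(3):
        tt.profile("inner")      # start an inner measurement
        sleep(.001)
        tt.end_profile("inner")  # "inner" ends up holding three measurements
    tt.end_profile("outer")      # "outer" holds one measurement covering the whole loop
    return tt.profiles["outer"].sum(), tt.profiles["inner"].mean()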
def _log_method_results(self, description: str, pname: str, divider=1):
    threshold = 2
    n = len(self.tt.profiles[pname])
    removed = self.tt.profiles[pname].remove_outliers(threshold)
    self.log("\n".join([
        description + ": " + TickTock.stringify_time(self.tt.profiles[pname].mean() / divider, TimeUnit.microsecond),
        "Mean: " + TickTock.stringify_time(self.tt.profiles[pname].mean(), TimeUnit.microsecond) + " p/m " +
            TickTock.stringify_time(norm.ppf(0.975) * self.tt.profiles[pname].std() / np.sqrt(n-removed), TimeUnit.nanosecond),
        "Std.: " + TickTock.stringify_time(self.tt.profiles[pname].std(), TimeUnit.microsecond),
        f"Removed {TickTock.thousand_seps(removed)} outliers with threshold {threshold} * mean.",
        f"Mean and std. are based on the remaining {TickTock.thousand_seps(n-removed)} measurements",
    ]))
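# Illustrative sketch of the interval logged above: the "p/m" term is the half-width of an
# approximate 95 % confidence interval for the mean, z_{0.975} * std / sqrt(n_remaining),
# where n_remaining is the number of measurements left after outlier removal. Assumes the
# same scipy.stats.norm and numpy imports as the method above; the example values are made up.
def ci_halfwidth(std: float, n_remaining: int) -> float:
    return norm.ppf(0.975) * std / np.sqrt(n_remaining)

# e.g. ci_halfwidth(std=2e-6, n_remaining=10_000) ~= 3.9e-8 (same unit as std)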
class Agent:

    eps = np.finfo("float").eps
    _explored_states = 0

    def __init__(self):
        self.action_queue = deque()
        self.tt = TickTock()

    @no_grad
    def search(self, state: np.ndarray, time_limit: float=None, max_states: int=None) -> bool:
        # Returns whether a path was found and generates action queue
        # Implement _step method for agents that look one step ahead, otherwise overwrite this method
        time_limit, max_states = self.reset(time_limit, max_states)
        self.tt.tick()

        if cube.is_solved(state):
            return True

        while self.tt.tock() < time_limit and len(self) < max_states:
            action, state, solution_found = self._step(state)
            self.action_queue.append(action)
            if solution_found:
                self._explored_states = len(self.action_queue)
                return True

        self._explored_states = len(self.action_queue)
        return False

    def _step(self, state: np.ndarray) -> (int, np.ndarray, bool):
        """
        Takes a step given a state
        :param state: numpy array containing a state
        :return: Action index, new state, is solved
        """
        raise NotImplementedError

    def reset(self, time_limit: float, max_states: int):
        self._explored_states = 0
        self.action_queue = deque()
        self.tt.reset()
        if hasattr(self, "net"):
            self.net.eval()
        assert time_limit or max_states
        time_limit = time_limit or 1e10
        max_states = max_states or int(1e10)
        return time_limit, max_states

    def __str__(self):
        raise NotImplementedError

    def __len__(self):
        # Returns number of states explored
        return self._explored_states
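# Hedged sketch of a minimal Agent subclass: only _step and __str__ are required, since
# search() above already handles the time/state limits and the action queue. The batch-of-one
# substate expansion mirrors the pattern used in ADI_traindata later in this section; the
# exact cube API semantics are assumed from that usage, and the uniformly random action
# choice is purely illustrative.
class RandomAgent(Agent):
    def _step(self, state: np.ndarray) -> (int, np.ndarray, bool):
        # Expand all possible substates of the current state, then pick one at random
        substates = cube.multi_rotate(np.repeat(state[None], cube.action_dim, axis=0), *cube.iter_actions(1))
        solved = cube.multi_is_solved(substates)
        action = np.random.randint(cube.action_dim)
        return action, substates[action], bool(solved[action])

    def __str__(self):
        return "RandomAgent"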
def benchmark():
    log = Logger("data/local_analyses/benchmarks.log", "Benchmarks")
    tt = TickTock()
    cube_bench = CubeBench(log, tt)

    # Cube config variables
    cn = int(1e7)
    multi_op_size = int(1e4)  # Number of states used in multi operations

    store_repr()
    for repr_ in [True, False]:
        set_is2024(repr_)
        log.section(f"Benchmarking cube environment with {_repstr()} representation")
        tt.profile(f"Benchmarking cube environment, {_repstr()}")
        cube_bench.rotate(cn)
        cube_bench.multi_rotate(int(cn / multi_op_size), multi_op_size)
        cube_bench.onehot(cn)
        cube_bench.multi_onehot(int(cn / multi_op_size), multi_op_size)
        cube_bench.check_solution(cn)
        cube_bench.check_multi_solution(int(cn / multi_op_size), multi_op_size)
        tt.end_profile(f"Benchmarking cube environment, {_repstr()}")

    restore_repr()

    log.section("Benchmark runtime distribution")
    log(tt)
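# Worked example of the chunking above: with cn = 1e7 single operations and
# multi_op_size = 1e4 states per multi operation, each multi_* benchmark performs
# int(cn / multi_op_size) = 1000 multi operations, so the single-state and multi-state
# benchmarks both touch the same total of 1e7 states.
assert int(int(1e7) / int(1e4)) == 1000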
class Train:

    states_per_rollout: int

    train_rollouts: np.ndarray
    value_losses: np.ndarray
    policy_losses: np.ndarray
    train_losses: np.ndarray
    sol_percents: list

    def __init__(self,
                 rollouts: int,
                 batch_size: int,  # Required to be > 1 when training with batchnorm
                 rollout_games: int,
                 rollout_depth: int,
                 optim_fn,
                 alpha_update: float,
                 lr: float,
                 gamma: float,
                 update_interval: int,
                 agent: DeepAgent,
                 evaluator: Evaluator,
                 evaluation_interval: int,
                 with_analysis: bool,
                 tau: float,
                 reward_method: str,
                 policy_criterion = torch.nn.CrossEntropyLoss,
                 value_criterion = torch.nn.MSELoss,
                 logger: Logger = NullLogger(),
                 ):
        """Sets up evaluation array, instantiates criteria and stores and documents settings

        :param bool with_analysis: If true, a number of statistics relating to loss behaviour and model output are stored.
        :param float alpha_update: alpha <- alpha + alpha_update every update_interval rollouts (excl. rollout 0)
        :param float gamma: lr <- lr * gamma every update_interval rollouts (excl. rollout 0)
        :param float tau: How much of the new network to use to generate ADI data
        """
        self.rollouts = rollouts
        self.train_rollouts = np.arange(self.rollouts)
        self.batch_size = self.states_per_rollout if not batch_size else batch_size
        self.rollout_games = rollout_games
        self.rollout_depth = rollout_depth
        self.adi_ff_batches = 1  # Number of batches used for feedforward in ADI_traindata. Used to limit vram usage
        self.reward_method = reward_method

        # Perform evaluation every evaluation_interval and after last rollout
        if evaluation_interval:
            self.evaluation_rollouts = np.arange(0, self.rollouts, evaluation_interval) - 1
            if evaluation_interval == 1:
                self.evaluation_rollouts = self.evaluation_rollouts[1:]
            else:
                self.evaluation_rollouts[0] = 0
            if self.rollouts - 1 != self.evaluation_rollouts[-1]:
                self.evaluation_rollouts = np.append(self.evaluation_rollouts, self.rollouts - 1)
        else:
            self.evaluation_rollouts = np.array([])
        self.agent = agent

        self.tau = tau
        self.alpha_update = alpha_update
        self.lr = lr
        self.gamma = gamma
        self.update_interval = update_interval  # How often alpha and lr are updated

        self.optim = optim_fn
        self.policy_criterion = policy_criterion(reduction='none')
        self.value_criterion = value_criterion(reduction='none')

        self.evaluator = evaluator
        self.log = logger
        self.log("\n".join([
            "Created trainer",
            f"Alpha update: {self.alpha_update:.2f}",
            f"Learning rate and gamma: {self.lr} and {self.gamma}",
            f"Learning rate and alpha will update every {self.update_interval} rollouts: lr <- {self.gamma:.4f} * lr and alpha += {self.alpha_update:.4f}"
                if self.update_interval else "Learning rate and alpha will not be updated during training",
            f"Optimizer: {self.optim}",
            f"Policy and value criteria: {self.policy_criterion} and {self.value_criterion}",
            f"Rollouts: {self.rollouts}",
            f"Batch size: {self.batch_size}",
            f"Rollout games: {self.rollout_games}",
            f"Rollout depth: {self.rollout_depth}",
            f"alpha update: {self.alpha_update}",
        ]))

        self.with_analysis = with_analysis
        if self.with_analysis:
            self.analysis = TrainAnalysis(self.evaluation_rollouts, self.rollout_games, self.rollout_depth,
                                          extra_evals=100, reward_method=reward_method, logger=self.log)  # Logger should not be set in standard use

        self.tt = TickTock()

    def train(self, net: Model) -> (Model, Model):
        """Training loop: generates data, optimizes parameters, evaluates (sometimes) and repeats.

        Trains `net` for `self.rollouts` rollouts, each consisting of `self.rollout_games` games scrambled to depth `self.rollout_depth`.
        The network is evaluated for each rollout number in `self.evaluation_rollouts` according to `self.evaluator`.
        Stores multiple performance and training results.

        :param torch.nn.Model net: The network to be trained. Must accept input consistent with cube.get_oh_size()
        :return: The network after all evaluations and the network with the best evaluation score (win fraction)
        :rtype: (torch.nn.Model, torch.nn.Model)
        """
        self.tt.reset()
        self.tt.tick()
        self.states_per_rollout = self.rollout_depth * self.rollout_games
        self.log(f"Beginning training. Optimization is performed in batches of {self.batch_size}")
        self.log("\n".join([
            f"Rollouts: {self.rollouts}",
            f"Each consisting of {self.rollout_games} games with a depth of {self.rollout_depth}",
            f"Evaluations: {len(self.evaluation_rollouts)}",
        ]))
        best_solve = 0
        best_net = net.clone()
        self.agent.net = net
        if self.with_analysis:
            self.analysis.orig_params = net.get_params()

        generator_net = net.clone()

        alpha = 1 if self.alpha_update == 1 else 0
        optimizer = self.optim(net.parameters(), lr=self.lr)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, self.gamma)
        self.policy_losses = np.zeros(self.rollouts)
        self.value_losses = np.zeros(self.rollouts)
        self.train_losses = np.empty(self.rollouts)
        self.sol_percents = list()

        for rollout in range(self.rollouts):
            reset_cuda()

            generator_net = self._update_gen_net(generator_net, net) if self.tau != 1 else net

            self.tt.profile("ADI training data")
            training_data, policy_targets, value_targets, loss_weights = self.ADI_traindata(generator_net, alpha)
            self.tt.profile("To cuda")
            training_data = training_data.to(gpu)
            policy_targets = policy_targets.to(gpu)
            value_targets = value_targets.to(gpu)
            loss_weights = loss_weights.to(gpu)
            self.tt.end_profile("To cuda")
            self.tt.end_profile("ADI training data")

            reset_cuda()

            self.tt.profile("Training loop")
            net.train()
            batches = self._get_batches(self.states_per_rollout, self.batch_size)
            for i, batch in enumerate(batches):
                optimizer.zero_grad()
                policy_pred, value_pred = net(training_data[batch], policy=True, value=True)

                # Use loss on both policy and value
                policy_loss = self.policy_criterion(policy_pred, policy_targets[batch]) * loss_weights[batch]
                value_loss = self.value_criterion(value_pred.squeeze(), value_targets[batch]) * loss_weights[batch]
                loss = torch.mean(policy_loss + value_loss)
                loss.backward()
                optimizer.step()
                self.policy_losses[rollout] += policy_loss.detach().cpu().numpy().mean() / len(batches)
                self.value_losses[rollout] += value_loss.detach().cpu().numpy().mean() / len(batches)

                if self.with_analysis:  # Save policy output to compute entropy
                    with torch.no_grad():
                        self.analysis.rollout_policy.append(
                            torch.nn.functional.softmax(policy_pred.detach(), dim=0).cpu().numpy()
                        )

            self.train_losses[rollout] = self.policy_losses[rollout] + self.value_losses[rollout]
            self.tt.end_profile("Training loop")

            # Updates learning rate and alpha
            if rollout and self.update_interval and rollout % self.update_interval == 0:
                if self.gamma != 1:
                    lr_scheduler.step()
                    lr = optimizer.param_groups[0]["lr"]
                    self.log(f"Updated learning rate from {lr/self.gamma:.2e} to {lr:.2e}")
                if (alpha + self.alpha_update <= 1 or np.isclose(alpha + self.alpha_update, 1)) and self.alpha_update:
                    alpha += self.alpha_update
                    self.log(f"Updated alpha from {alpha-self.alpha_update:.2f} to {alpha:.2f}")
                elif alpha < 1 and alpha + self.alpha_update > 1 and self.alpha_update:
                    self.log(f"Updated alpha from {alpha:.2f} to 1")
                    alpha = 1

            if self.log.is_verbose() or rollout in (np.linspace(0, 1, 20)*self.rollouts).astype(int):
                self.log(f"Rollout {rollout} completed with mean loss {self.train_losses[rollout]}")

            if self.with_analysis:
                self.tt.profile("Analysis of rollout")
                self.analysis.rollout(net, rollout, value_targets)
                self.tt.end_profile("Analysis of rollout")

            if rollout in self.evaluation_rollouts:
                net.eval()

                self.agent.net = net
                self.tt.profile(f"Evaluating using agent {self.agent}")
                with unverbose:
                    eval_results, _, _ = self.evaluator.eval(self.agent)
                eval_reward = (eval_results != -1).mean()
                self.sol_percents.append(eval_reward)
                self.tt.end_profile(f"Evaluating using agent {self.agent}")

                if eval_reward > best_solve:
                    best_solve = eval_reward
                    best_net = net.clone()
                    self.log(f"Updated best net with solve rate {eval_reward*100:.2f} % at depth {self.evaluator.scrambling_depths}")

        self.log.section("Finished training")
        if len(self.evaluation_rollouts):
            self.log(f"Best net solves {best_solve*100:.2f} % of games at depth {self.evaluator.scrambling_depths}")
        self.log.verbose("Training time distribution")
        self.log.verbose(self.tt)
        total_time = self.tt.tock()
        eval_time = self.tt.profiles[f'Evaluating using agent {self.agent}'].sum() if len(self.evaluation_rollouts) else 0
        train_time = self.tt.profiles["Training loop"].sum()
        adi_time = self.tt.profiles["ADI training data"].sum()
        nstates = self.rollouts * self.rollout_games * self.rollout_depth * cube.action_dim
        states_per_sec = int(nstates / (adi_time + train_time))
        self.log("\n".join([
            f"Total running time: {self.tt.stringify_time(total_time, TimeUnit.second)}",
            f"- Training data for ADI: {self.tt.stringify_time(adi_time, TimeUnit.second)} or {adi_time/total_time*100:.2f} %",
            f"- Training time: {self.tt.stringify_time(train_time, TimeUnit.second)} or {train_time/total_time*100:.2f} %",
            f"- Evaluation time: {self.tt.stringify_time(eval_time, TimeUnit.second)} or {eval_time/total_time*100:.2f} %",
            f"States witnessed incl. substates: {TickTock.thousand_seps(nstates)}",
            f"- Per training second: {TickTock.thousand_seps(states_per_sec)}",
        ]))

        return net, best_net

    def _get_adi_ff_slices(self):
        data_points = self.rollout_games * self.rollout_depth * cube.action_dim
        slice_size = data_points // self.adi_ff_batches + 1
        # Final slice may have overflow, however this is simply ignored when indexing
        slices = [slice(i*slice_size, (i+1)*slice_size) for i in range(self.adi_ff_batches)]
        return slices

    @no_grad
    def ADI_traindata(self, net, alpha: float):
        """Training data generation

        Implements Autodidactic Iteration as per McAleer, Agostinelli, Shmakov and Baldi,
        "Solving the Rubik's Cube Without Human Knowledge", section 4.1.
        Loss weighting is dependent on `self.loss_weighting`.

        :param torch.nn.Model net: The network used for generating the training data. This should according to ADI be the network from the last rollout.
        :param int rollout: The current rollout number. Used in adaptive loss weighting.

        :return: Games * sequence_length number of observations divided in four arrays
            - states contains the rubiks state for each data point
            - policy_targets and value_targets contain optimal policy and value targets for each training point
            - loss_weights contains the weight for each training point (see weighted samples subsection of the McAleer et al. paper)
        :rtype: (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor)
        """
        net.eval()
        self.tt.profile("Scrambling")
        # Only include solved state in training if using Max Lapan convergence fix
        states, oh_states = cube.sequence_scrambler(self.rollout_games, self.rollout_depth, with_solved = self.reward_method == 'lapanfix')
        self.tt.end_profile("Scrambling")

        # Keeps track of solved states - Max Lapan's convergence fix
        solved_scrambled_states = cube.multi_is_solved(states)

        # Generates possible substates for all scrambled states. Shape: n_states*action_dim x *Cube_shape
        self.tt.profile("ADI substates")
        substates = cube.multi_rotate(np.repeat(states, cube.action_dim, axis=0), *cube.iter_actions(len(states)))
        self.tt.end_profile("ADI substates")
        self.tt.profile("One-hot encoding")
        substates_oh = cube.as_oh(substates)
        self.tt.end_profile("One-hot encoding")

        self.tt.profile("Reward")
        solved_substates = cube.multi_is_solved(substates)
        # Reward for won state is 1 normally but 0 if running with reward0
        rewards = (torch.zeros if self.reward_method == 'reward0' else torch.ones)(*solved_substates.shape)
        rewards[~solved_substates] = -1
        self.tt.end_profile("Reward")

        # Generates policy and value targets
        self.tt.profile("ADI feedforward")
        while True:
            try:
                value_parts = [net(substates_oh[slice_], policy=False, value=True).squeeze() for slice_ in self._get_adi_ff_slices()]
                values = torch.cat(value_parts).cpu()
                break
            except RuntimeError as e:
                # Usually caused by running out of vram. If not, the error is still raised, else batch size is reduced
                if "alloc" not in str(e):
                    raise e
                self.log.verbose(f"Intercepted RuntimeError {e}\nIncreasing number of ADI feed forward batches from {self.adi_ff_batches} to {self.adi_ff_batches*2}")
                self.adi_ff_batches *= 2
        self.tt.end_profile("ADI feedforward")

        self.tt.profile("Calculating targets")
        values += rewards
        values = values.reshape(-1, 12)
        policy_targets = torch.argmax(values, dim=1)
        value_targets = values[np.arange(len(values)), policy_targets]
        if self.reward_method == 'lapanfix':
            # Trains on goal state, sets goalstate to 0
            value_targets[solved_scrambled_states] = 0
        elif self.reward_method == 'schultzfix':
            # Does not train on goal state, but sets first 12 substates to 0
            first_substates = np.zeros(len(states), dtype=bool)
            first_substates[np.arange(0, len(states), self.rollout_depth)] = True
            value_targets[first_substates] = 0
        self.tt.end_profile("Calculating targets")

        # Weighting examples according to alpha
        weighted = np.tile(1 / np.arange(1, self.rollout_depth+1), self.rollout_games)
        unweighted = np.ones_like(weighted)
        ws, us = weighted.sum(), len(unweighted)
        loss_weights = ((1-alpha) * weighted / ws + alpha * unweighted / us) * (ws + us)

        if self.with_analysis:
            self.tt.profile("ADI analysis")
            self.analysis.ADI(values)
            self.tt.end_profile("ADI analysis")

        return oh_states, policy_targets, value_targets, torch.from_numpy(loss_weights).float()

    def _update_gen_net(self, generator_net: Model, net: Model):
        """Create a network with parameters weighted by self.tau"""
        self.tt.profile("Creating generator network")
        genparams, netparams = generator_net.state_dict(), net.state_dict()
        new_genparams = dict(genparams)
        for pname, param in netparams.items():
            new_genparams[pname].data.copy_(
                self.tau * param.data.to(gpu) + (1-self.tau) * new_genparams[pname].data.to(gpu)
            )
        generator_net.load_state_dict(new_genparams)
        self.tt.end_profile("Creating generator network")
        return generator_net.to(gpu)

    def plot_training(self, save_dir: str, name: str, semi_logy=False, show=False):
        """Visualizes training by showing training loss + evaluation reward in same plot"""
        self.log("Making plot of training")
        fig, loss_ax = plt.subplots(figsize=(23, 10))

        colour = "red"
        loss_ax.set_ylabel("Training loss")
        loss_ax.plot(self.train_rollouts, self.train_losses, linewidth=3, color=colour, label="Training loss")
        loss_ax.plot(self.train_rollouts, self.policy_losses, linewidth=2, linestyle="dashdot", color="orange", label="Policy loss")
        loss_ax.plot(self.train_rollouts, self.value_losses, linewidth=2, linestyle="dashed", color="green", label="Value loss")
        loss_ax.tick_params(axis='y', labelcolor=colour)
        loss_ax.set_xlabel(f"Rollout, each of {TickTock.thousand_seps(self.states_per_rollout)} states")
        loss_ax.set_ylim(np.array([-0.05*1.35, 1.35]) * self.train_losses.max())
        h1, l1 = loss_ax.get_legend_handles_labels()

        if len(self.evaluation_rollouts):
            color = 'blue'
            reward_ax = loss_ax.twinx()
            reward_ax.set_ylim([-5, 105])
            reward_ax.set_ylabel("Solve rate (~95 % CI) [%]")
            sol_shares = np.array(self.sol_percents)
            bernoulli_errors = bernoulli_error(sol_shares, self.evaluator.n_games, alpha=0.05)
            reward_ax.errorbar(self.evaluation_rollouts, sol_shares*100, bernoulli_errors*100, fmt="-o",
                               capsize=10, color=color, label="Policy performance", errorevery=2, alpha=0.8)
            reward_ax.tick_params(axis='y', labelcolor=color)
            h2, l2 = reward_ax.get_legend_handles_labels()
            h1 += h2
            l1 += l2
        loss_ax.legend(h1, l1, loc=2)

        title = f"Training - {TickTock.thousand_seps(self.rollouts*self.rollout_games*self.rollout_depth)} states"
        plt.title(title)
        fig.tight_layout()
        if semi_logy:
            plt.semilogy()
        plt.grid(True)

        os.makedirs(save_dir, exist_ok=True)
        path = os.path.join(save_dir, f"training_{name}.png")
        plt.savefig(path)
        self.log(f"Saved loss and evaluation plot to {path}")

        if show:
            plt.show()
        plt.clf()

    @staticmethod
    def _get_batches(size: int, bsize: int):
        """Generates indices for batches"""
        nbatches = int(np.ceil(size/bsize))
        idcs = np.arange(size)
        np.random.shuffle(idcs)
        batches = [slice(batch*bsize, (batch+1)*bsize) for batch in range(nbatches)]
        batches[-1] = slice(batches[-1].start, size)
        return batches
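# Hedged usage sketch of Train._get_batches as written above: it returns a list of slice
# objects that partition range(size), with the last slice clipped to size. The example
# numbers are illustrative only.
def example_batching():
    batches = Train._get_batches(size=10, bsize=4)
    # -> three slices covering 0:4, 4:8 and 8:10
    for batch in batches:
        pass  # e.g. training_data[batch] selects one mini-batch, as in the training loop above
    return batches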
class Evaluator:
    def __init__(self,
                 n_games,
                 scrambling_depths: range or list,
                 max_time = None,  # Max time to completion per game
                 max_states = None,  # The max number of states to explore per game
                 logger: Logger = NullLogger()
                 ):
        self.n_games = n_games
        self.max_time = max_time
        self.max_states = max_states

        self.tt = TickTock()
        self.log = logger
        # Use array of scrambling depths if not deep evaluation, else just a one-element array with 0
        self.scrambling_depths = np.array(scrambling_depths) if scrambling_depths != range(0) else np.array([0])

        self.log("\n".join([
            "Creating evaluator",
            f"Games per scrambling depth: {self.n_games}",
            f"Scrambling depths: {'Uniformly sampled in [100, 999]' if self._isdeep() else scrambling_depths}",
        ]))

    def _isdeep(self):
        return self.scrambling_depths.size == 1 and self.scrambling_depths[0] == 0

    def approximate_time(self):
        return self.max_time * self.n_games * len(self.scrambling_depths)

    def _eval_game(self, agent: agents.Agent, depth: int, profile: str):
        turns_to_complete = -1  # -1 for unfinished
        state, _, _ = cube.scramble(depth, True)
        self.tt.profile(profile)
        solution_found = agent.search(state, self.max_time, self.max_states)
        dt = self.tt.end_profile(profile)
        if solution_found:
            turns_to_complete = len(agent.action_queue)
        return turns_to_complete, dt

    def eval(self, agent: agents.Agent) -> (np.ndarray, np.ndarray, np.ndarray):
        """Evaluates an agent

        Returns results, a len(self.scrambling_depths) x self.n_games matrix.
        Each entry contains the number of steps needed to solve the scrambled cube or -1 if not solved.
        """
        self.log.section(f"Evaluation of {agent}")
        self.log("\n".join([
            f"{self.n_games*len(self.scrambling_depths)} cubes",
            f"Maximum solve time per cube is {TickTock.stringify_time(self.max_time, TimeUnit.second)} "
            f"and estimated total time <= {TickTock.stringify_time(self.approximate_time(), TimeUnit.minute)}" if self.max_time else "No time limit given",
            f"Maximum number of explored states is {TickTock.thousand_seps(self.max_states)}" if self.max_states else "No max states given",
        ]))

        res = []
        states = []
        times = []
        for d in self.scrambling_depths:
            for _ in range(self.n_games):
                if self._isdeep():  # Randomly sample evaluation depth for deep evaluations
                    d = np.random.randint(100, 1000)
                p = f"Evaluation of {agent}. Depth {'100 - 999' if self._isdeep() else d}"
                r, dt = self._eval_game(agent, d, p)

                res.append(r)
                states.append(len(agent))
                times.append(dt)
            if not self._isdeep():
                self.log.verbose(f"Performed evaluation at depth: {d}/{self.scrambling_depths[-1]}")

        res = np.reshape(res, (len(self.scrambling_depths), self.n_games))
        states = np.reshape(states, (len(self.scrambling_depths), self.n_games))
        times = np.reshape(times, (len(self.scrambling_depths), self.n_games))

        self.log("Evaluation results")
        for i, d in enumerate(self.scrambling_depths):
            self.log_this_depth(res[i], states[i], times[i], d)

        self.log.verbose(f"Evaluation runtime\n{self.tt}")

        return res, states, times

    def log_this_depth(self, res: np.ndarray, states: np.ndarray, times: np.ndarray, depth: int):
        """Logs summary statistics for given depth

        :param res: Vector of results
        :param states: Vector of seen states for each game
        :param times: Vector of runtimes for each game
        :param depth: Scrambling depth at which results were generated
        """
        share_completed = np.count_nonzero(res != -1) * 100 / len(res)
        won_games = res[res != -1]
        self.log(f"Scrambling depth {depth if depth else 'deep'}", with_timestamp=False)
        self.log(
            f"\tShare completed: {share_completed:.2f} % {bernoulli_error(share_completed/100, len(res), 0.05, stringify=True)} (approx. 95 % CI)",
            with_timestamp=False
        )
        if won_games.size:
            mean_turns = won_games.mean()
            median_turns = np.median(won_games)
            std_turns = won_games.std()
            self.log(
                f"\tTurns to win: {mean_turns:.2f} +/- {std_turns:.1f} (std.), Median: {median_turns:.0f}",
                with_timestamp=False
            )

        safe_times = times != 0
        states_per_sec = states[safe_times] / times[safe_times]
        self.log(
            f"\tStates seen: Pr. game: {states.mean():.2f} +/- {states.std():.0f} (std.), "
            f"Pr. sec.: {states_per_sec.mean():.2f} +/- {states_per_sec.std():.0f} (std.)",
            with_timestamp=False)
        self.log(f"\tTime: {times.mean():.2f} +/- {times.std():.2f} (std.)", with_timestamp=False)

    @classmethod
    def plot_evaluators(cls, eval_results: dict, eval_states: dict, eval_times: dict, eval_settings: dict, save_dir: str, title: str='') -> list:
        """Plots evaluation results

        :param eval_results:  { agent name: [steps to solve, -1 for unfinished] }
        :param eval_states:   { agent name: [states seen during solving] }
        :param eval_times:    { agent name: [time spent solving] }
        :param eval_settings: { agent name: { 'n_games': int, 'max_time': float, 'max_states': int, 'scrambling_depths': np.ndarray } }
        :param save_dir: Directory in which to save plots
        :param title: If given, overrides auto generated title in (depth, winrate) plot
        :return: Locations of saved plots
        """
        assert eval_results.keys() == eval_states.keys() == eval_times.keys() == eval_settings.keys(), "Keys of evaluation dictionaries should match"
        os.makedirs(save_dir, exist_ok=True)

        tab_colours = list(mcolour.TABLEAU_COLORS)
        colours = [tab_colours[i % len(tab_colours)] for i in range(len(eval_results))]

        save_paths = [
            cls._plot_depth_win(eval_results, save_dir, eval_settings, colours, title),
            cls._sol_length_boxplots(eval_results, save_dir, eval_settings, colours),
        ]
        # Only plot (time, winrate), (states, winrate), and their distributions if settings are the same
        if all(cls.check_equal_settings(eval_settings)):
            d = cls._get_a_value(eval_settings)["scrambling_depths"][-1]
            save_paths.extend([
                cls._time_states_winrate_plot(eval_results, eval_times, True, d, save_dir, eval_settings, colours),
                cls._time_states_winrate_plot(eval_results, eval_states, False, d, save_dir, eval_settings, colours),
            ])
            p = cls._distribution_plots(eval_results, eval_times, eval_states, d, save_dir, eval_settings, colours)
            if p != "ERROR":
                save_paths.extend(p)

        return save_paths

    @classmethod
    def _plot_depth_win(cls, eval_results: dict, save_dir: str, eval_settings: dict, colours: list, title: str='') -> str:
        # depth, win%-graph
        games_equal, times_equal = cls.check_equal_settings(eval_settings)
        fig, ax = plt.subplots(figsize=(19.2, 10.8))
        ax.set_ylabel(f"Percentage of {cls._get_a_value(eval_settings)['n_games']} games won" if games_equal else "Percentage of games won")
        ax.set_xlabel("Scrambling depth: Number of random rotations applied to cubes")
        ax.locator_params(axis='x', integer=True, tight=True)

        for i, (agent, results) in enumerate(eval_results.items()):
            used_settings = eval_settings[agent]
            color = colours[i]
            win_percentages = (results != -1).mean(axis=1) * 100

            ax.plot(used_settings['scrambling_depths'], win_percentages, linestyle='dashdot', color=color)
            ax.scatter(used_settings['scrambling_depths'], win_percentages, color=color, label=agent)
        ax.legend()
        ax.set_ylim([-5, 105])
        ax.grid(True)
        ax.set_title(title if title else (f"Percentage of cubes solved in {cls._get_a_value(eval_settings)['max_time']:.2f} seconds" if times_equal else "Cubes solved"))
        fig.tight_layout()

        path = os.path.join(save_dir, "eval_winrates.png")
        plt.savefig(path)
        plt.clf()

        return path

    @classmethod
    def _sol_length_boxplots(cls, eval_results: dict, save_dir: str, eval_settings: dict, colours: list) -> str:
        # Solution length boxplots
        plt.rcParams.update(rc_params_small)
        max_width = 2
        width = min(len(eval_results), max_width)
        height = (len(eval_results)+1) // width if width == max_width else 1
        positions = [(i, j) for i in range(height) for j in range(width)]
        fig, axes = plt.subplots(height, width, figsize=(width*10, height*6))

        max_sollength = 50
        agents, agent_results = list(zip(*eval_results.items()))
        agent_results = tuple(x.copy() for x in agent_results)
        for res in agent_results:
            res[res > max_sollength] = max_sollength
        ylim = np.array([-0.02, 1.02]) * max([res.max() for res in agent_results])
        min_ = min([x["scrambling_depths"][0] for x in eval_settings.values()])
        max_ = max([x["scrambling_depths"][-1] for x in eval_settings.values()])
        xticks = np.arange(min_, max_+1, max(np.ceil((max_-min_+1)/8).astype(int), 1))
        for used_settings, (i, position) in zip(eval_settings.values(), enumerate(positions)):
            # Make sure axes are stored in a matrix, so they are easier to work with, and select axes object
            if len(eval_results) == 1:
                axes = np.array([[axes]])
            elif len(eval_results) <= width and i == 0:
                axes = np.expand_dims(axes, 0)
            ax = axes[position]
            if position[1] == 0:
                ax.set_ylabel("Solution length")
            if position[0] == height - 1 or len(eval_results) <= width:
                ax.set_xlabel("Scrambling depth")

            ax.locator_params(axis="y", integer=True, tight=True)

            try:
                agent, results = agents[i], agent_results[i]
                assert type(agent) == str, str(type(agent))
                ax.set_title(agent if axes.size > 1 else "Solution lengths for " + agent)
                results = [depth[depth != -1] for depth in results]
                ax.boxplot(results)
                ax.grid(True)
            except IndexError:
                pass
            ax.set_ylim(ylim)
            ax.set_xlim([used_settings["scrambling_depths"].min()-1, used_settings["scrambling_depths"].max()+1])

        plt.setp(axes, xticks=xticks, xticklabels=[str(x) for x in xticks])
        plt.rcParams.update(rc_params)
        if axes.size > 1:
            fig.suptitle("Solution lengths")
        fig.tight_layout()
        fig.subplots_adjust(top=0.88)
        path = os.path.join(save_dir, "eval_sollengths.png")
        plt.savefig(path)
        plt.clf()

        return path

    @classmethod
    def _time_states_winrate_plot(cls, eval_results: dict, eval_times_or_states: dict, is_times: bool,
                                  depth: int, save_dir: str, eval_settings: dict, colours: list) -> str:
        # Make a (time spent, winrate) plot if is_times else (states explored, winrate)
        # Only done for the deepest configuration
        plt.figure(figsize=(19.2, 10.8))
        max_value = 0
        for (agent, res), values, colour in zip(eval_results.items(), eval_times_or_states.values(), colours):
            sort_idcs = np.argsort(values.ravel())  # Use values from all different depths - mainly for deep evaluation
            wins, values = (res != -1).ravel()[sort_idcs], values.ravel()[sort_idcs]
            max_value = max(max_value, values.max())
            cumulative_winrate = np.cumsum(wins) / len(wins) * 100
            plt.plot(values, cumulative_winrate, "o-", linewidth=3, color=colour, label=agent)
        plt.xlabel("Time used [s]" if is_times else "States explored")
        plt.ylabel("Winrate [%]")
        plt.xlim([-0.05*max_value, 1.05*max_value])
        plt.ylim([-5, 105])
        plt.legend()
        plt.title(f"Winrate against {'time used for' if is_times else 'states seen during'} solving at depth {depth if depth else '100 - 999'}")
        plt.grid(True)
        plt.tight_layout()

        path = os.path.join(save_dir, "time_winrate.png" if is_times else "states_winrate.png")
        plt.savefig(path)
        plt.clf()

        return path

    @classmethod
    def _distribution_plots(cls, eval_results: dict, eval_times: dict, eval_states: dict, depth: int,
                            save_dir: str, eval_settings: dict, colours: list) -> str:
        """Histograms of solution length, time used, and states explored for won games"""

        normal_pdf = lambda x, mu, sigma: np.exp(-1/2 * ((x-mu)/sigma)**2) / (sigma * np.sqrt(2*np.pi))

        won_games = { agent: (res != -1).ravel() for agent, res in eval_results.items() }
        if all(w.sum() <= 1 for w in won_games.values()):
            return "ERROR"
        eval_results = { agent: res.ravel()[won_games[agent]] for agent, res in eval_results.items() if won_games[agent].sum() > 1 }
        eval_times = { agent: times.ravel()[won_games[agent]] for agent, times in eval_times.items() if won_games[agent].sum() > 1 }
        eval_states = { agent: states.ravel()[won_games[agent]] for agent, states in eval_states.items() if won_games[agent].sum() > 1 }

        eval_data = [eval_results, eval_times, eval_states]
        x_labels = ["Solution length", "Time used [s]", "States seen"]
        titles = ["Distribution of solution lengths for solved cubes",
                  "Distribution of time used for solved cubes",
                  "Distribution of states seen for solved cubes"]
        paths = [os.path.join(save_dir, x) + ".png" for x in ["solve_length_dist", "time_dist", "state_dist"]]
        paths_iter = iter(paths)

        for data, xlab, title, path in zip(eval_data, x_labels, titles, paths):
            plt.figure(figsize=(19.2, 10.8))
            agents = list(data.keys())
            values = [data[agent] for agent in agents]
            apply_to_values = lambda fun: fun([fun(v) for v in values])
            mus, sigmas = np.array([v.mean() for v in values]), np.array([v.std() for v in values])
            min_, max_ = apply_to_values(np.min), apply_to_values(np.max)
            if xlab == "Solution length":
                lower, higher = min_ - 2, max_ + 2
            else:
                lower = min_ - (max_ - min_) * 0.1
                higher = max_ + (max_ - min_) * 0.1
            highest_y = 0
            for i, (agent, v) in enumerate(zip(agents, values)):
                bins = np.arange(lower, higher+1) if xlab == "Solution length" else int(np.sqrt(len(v))*2) + 1
                heights, _, _ = plt.hist(x=v, bins=bins, density=True, color=colours[i], edgecolor="black",
                                         linewidth=2, alpha=0.5, align="left" if xlab == "Solution length" else "mid",
                                         label=f"{agent}: {mus[i]:.2f}")
                highest_y = max(highest_y, np.max(heights))
            if xlab == "Solution length":
                for i in range(len(data)):
                    if sigmas[i] > 0:
                        x = np.linspace(lower, higher, 1000)
                        y = normal_pdf(x, mus[i], sigmas[i])
                        x = x[~np.isnan(y)]
                        y = y[~np.isnan(y)]
                        plt.plot(x, y, color="black", linewidth=9)
                        plt.plot(x, y, color=colours[i], linewidth=5)
                        highest_y = max(highest_y, y.max())
            plt.xlim([lower, higher])
            plt.ylim([0, highest_y*(1+0.1*max(3, len(eval_results)))])  # To make room for labels
            plt.xlabel(xlab)
            plt.ylabel("Frequency")
            plt.title(f"{title} at depth {depth if depth else '100 - 999'}")
            plt.legend()
            plt.savefig(next(paths_iter))
            plt.clf()

        return paths

    @staticmethod
    def _get_a_value(obj: dict):
        """Returns a value from the object"""
        return obj[list(obj.keys())[0]]

    @staticmethod
    def check_equal_settings(eval_settings: dict):
        """Super simple looper just to hide the ugliness"""
        games, times = list(), list()
        for setting in eval_settings.values():
            games.append(setting['max_time'])
            times.append(setting['n_games'])
        return games.count(games[0]) == len(games), times.count(times[0]) == len(times)
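# Hedged usage sketch of the Evaluator above: construct it with fixed scrambling depths,
# evaluate an agent, and summarize per-depth solve rates. The agent argument is a
# placeholder (agent classes are defined elsewhere), and the n_games/max_time values are
# illustrative only.
def example_evaluation(agent):
    evaluator = Evaluator(n_games=50, scrambling_depths=range(1, 11), max_time=1, logger=NullLogger())
    res, states, times = evaluator.eval(agent)   # res[i, j]: solution length at depth i, game j, or -1
    win_rates = (res != -1).mean(axis=1)         # per-depth solve fraction
    return win_rates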
from datetime import timedelta

import numpy as np

from runtrain import options
from librubiks.jobs import TrainJob
from librubiks.utils import set_seeds, Logger, Parser, TickTock

if __name__ == "__main__":
    set_seeds()

    parser = Parser(options, description="Estimate the amount of time required for given jobs", name="train")
    estimated_runtime = 0
    tt = TickTock()
    job_settings = parser.parse(False)
    for settings in job_settings:
        job_rollouts = settings["rollouts"]
        job_evaluation_interval = settings["evaluation_interval"]
        settings["rollouts"] = 5  # Five rollouts should be good enough to give a decent estimate
        settings["evaluation_interval"] = 0

        # Estimates training time
        tt.tick()
        train = TrainJob(**settings)
        train.execute()
        estimated_runtime += tt.tock() * job_rollouts / settings["rollouts"]

        # Estimates evaluation time
        evaluations = job_rollouts / job_evaluation_interval if job_evaluation_interval else 0
        estimated_runtime += np.ceil(
import matplotlib.pyplot as plt
import numpy as np
import torch

from librubiks import gpu, no_grad
from librubiks import cube
from librubiks.model import Model
from librubiks.utils import TickTock, Logger

tt = TickTock()
log = Logger("data/local_analyses/net.log", "Analyzing MCTS")
net = Model.load("data/local_method_comparison/asgerfix").eval().to(gpu)

def _get_adi_ff_slices(b, n):
    slice_size = n // b + 1
    # Final slice may have overflow, however this is simply ignored when indexing
    slices = [slice(i * slice_size, (i + 1) * slice_size) for i in range(b)]
    return slices

def _ff(oh_states, value=True, policy=True):
    batches = 1
    while True:
        try:
            value_parts = [net(oh_states[slice_], policy=policy, value=value).squeeze()
                           for slice_ in _get_adi_ff_slices(batches, len(oh_states))]
            values = torch.cat(value_parts).cpu()
            break
        except RuntimeError as e:
            # Usually caused by running out of vram. If not, the error is still raised, else batch size is reduced
            if "alloc" not in str(e):
                raise e