Example #1
from time import sleep

import numpy as np

from librubiks.utils import TickTock


def test_tt():
    tt = TickTock()
    tt.profile("test0")
    sleep(.01)
    tt.profile("test1")
    sleep(.01)
    tt.end_profile("test1")
    sleep(.01)
    tt.end_profile("test0")
    assert np.isclose(0.03, tt.profiles["test0"].sum(), 1)
    assert np.isclose(0.01, tt.profiles["test1"].sum(), 1)
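The test above exercises the two TickTock patterns used throughout these examples: a global tick/tock stopwatch and named, nestable profiles. A minimal usage sketch, assuming only the librubiks.utils.TickTock API as it appears in the snippets below:

from time import sleep

from librubiks.utils import TickTock

tt = TickTock()
tt.tick()                # start the global stopwatch
tt.profile("outer")      # open a named profile; profiles can be nested
sleep(0.01)
tt.profile("inner")
sleep(0.01)
tt.end_profile("inner")
tt.end_profile("outer")
print(tt.profiles["outer"].sum())  # total time recorded under "outer"
print(tt.tock())                   # seconds elapsed since tick()
print(tt)                          # printable runtime distribution, as logged in the later examples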
Example #2
def _log_method_results(self, description: str, pname: str, divider=1):
    threshold = 2
    n = len(self.tt.profiles[pname])
    removed = self.tt.profiles[pname].remove_outliers(threshold)
    self.log("\n".join([
        description + ": " + TickTock.stringify_time(self.tt.profiles[pname].mean() / divider, TimeUnit.microsecond),
        "Mean: " + TickTock.stringify_time(self.tt.profiles[pname].mean(), TimeUnit.microsecond) + " p/m "
            + TickTock.stringify_time(norm.ppf(0.975) * self.tt.profiles[pname].std() / np.sqrt(n - removed), TimeUnit.nanosecond),
        "Std.: " + TickTock.stringify_time(self.tt.profiles[pname].std(), TimeUnit.microsecond),
        f"Removed {TickTock.thousand_seps(removed)} outliers with threshold {threshold} * mean.",
        f"Mean and std. are based on the remaining {TickTock.thousand_seps(n - removed)} measurements",
    ]))
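The "p/m" (plus/minus) term logged above is a 95 % confidence interval of the mean, norm.ppf(0.975) * std / sqrt(n). A minimal standalone sketch of the same calculation on dummy data, assuming norm is scipy.stats.norm:

import numpy as np
from scipy.stats import norm  # norm.ppf(0.975) is roughly 1.96

measurements = np.random.exponential(2e-6, size=10_000)  # dummy timing measurements in seconds
n = len(measurements)
half_width = norm.ppf(0.975) * measurements.std() / np.sqrt(n)  # 95 % CI half-width of the mean
print(f"Mean: {measurements.mean():.3e} s p/m {half_width:.3e} s over {n} measurements")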
Example #3
from collections import deque

import numpy as np

from librubiks import cube, no_grad
from librubiks.utils import TickTock


class Agent:
	eps = np.finfo("float").eps
	_explored_states = 0

	def __init__(self):
		self.action_queue = deque()
		self.tt = TickTock()

	@no_grad
	def search(self, state: np.ndarray, time_limit: float=None, max_states: int=None) -> bool:
		# Returns whether a path was found and generates action queue
		# Implement _step method for agents that look one step ahead, otherwise overwrite this method
		time_limit, max_states = self.reset(time_limit, max_states)
		self.tt.tick()

		if cube.is_solved(state): return True
		while self.tt.tock() < time_limit and len(self) < max_states:
			action, state, solution_found = self._step(state)
			self.action_queue.append(action)
			if solution_found:
				self._explored_states = len(self.action_queue)
				return True

		self._explored_states = len(self.action_queue)
		return False


	def _step(self, state: np.ndarray) -> (int, np.ndarray, bool):
		"""
		Takes a step given a state
		:param state: numpy array containing a state
		:return: Action index, new state, is solved
		"""
		raise NotImplementedError

	def reset(self, time_limit: float, max_states: int):
		self._explored_states = 0
		self.action_queue = deque()
		self.tt.reset()
		if hasattr(self, "net"): self.net.eval()
		assert time_limit or max_states
		time_limit = time_limit or 1e10
		max_states = max_states or int(1e10)
		return time_limit, max_states

	def __str__(self):
		raise NotImplementedError

	def __len__(self):
		# Returns number of states explored
		return self._explored_states
Example #4
def benchmark():
    log = Logger("data/local_analyses/benchmarks.log", "Benchmarks")
    tt = TickTock()
    cube_bench = CubeBench(log, tt)

    # Cube config variables
    cn = int(1e7)
    multi_op_size = int(1e4)  # Number of states used in multi operations

    store_repr()
    for repr_ in [True, False]:
        set_is2024(repr_)
        log.section(
            f"Benchmarking cube enviroment with {_repstr()} representation")
        tt.profile(f"Benchmarking cube environment, {_repstr()}")
        cube_bench.rotate(cn)
        cube_bench.multi_rotate(int(cn / multi_op_size), multi_op_size)
        cube_bench.onehot(cn)
        cube_bench.multi_onehot(int(cn / multi_op_size), multi_op_size)
        cube_bench.check_solution(cn)
        cube_bench.check_multi_solution(int(cn / multi_op_size), multi_op_size)
        tt.end_profile(f"Benchmarking cube environment, {_repstr()}")

    restore_repr()

    log.section("Benchmark runtime distribution")
    log(tt)
Example #5
	def __init__(self,
		         n_games,
		         scrambling_depths: range or list,
		         max_time = None,  # Max time to completion per game
		         max_states = None,  # The max number of states to explore per game
		         logger: Logger = NullLogger()
		):

		self.n_games = n_games
		self.max_time = max_time
		self.max_states = max_states

		self.tt = TickTock()
		self.log = logger
		# Use array of scrambling depths if not deep evaluation, else just a one-element array containing 0
		self.scrambling_depths = np.array(scrambling_depths) if scrambling_depths != range(0) else np.array([0])

		self.log("\n".join([
			"Creating evaluator",
			f"Games per scrambling depth: {self.n_games}",
			f"Scrambling depths: {scrambling_depths if self._isdeep() else 'Uniformly sampled in [100, 999]'}",
		]))
Example #6
	def __init__(self,
				 rollouts: int,
				 batch_size: int,  # Required to be > 1 when training with batchnorm
				 rollout_games: int,
				 rollout_depth: int,
				 optim_fn,
				 alpha_update: float,
				 lr: float,
				 gamma: float,
				 update_interval: int,
				 agent: DeepAgent,
				 evaluator: Evaluator,
				 evaluation_interval: int,
				 with_analysis: bool,
				 tau: float,
				 reward_method: str,
				 policy_criterion	= torch.nn.CrossEntropyLoss,
				 value_criterion	= torch.nn.MSELoss,
				 logger: Logger		= NullLogger(),
				 ):
		"""Sets up evaluation array, instantiates critera and stores and documents settings


		:param bool with_analysis: If true, a number of statistics relating to loss behaviour and model output are stored.
		:param float alpha_update: alpha <- alpha + alpha_update every update_interval rollouts (excl. rollout 0)
		:param float gamma: lr <- lr * gamma every update_interval rollouts (excl. rollout 0)
		:param float tau: How much of the new network to use to generate ADI data
		"""
		self.rollouts = rollouts
		self.train_rollouts = np.arange(self.rollouts)
		self.batch_size = self.states_per_rollout if not batch_size else batch_size
		self.rollout_games = rollout_games
		self.rollout_depth = rollout_depth
		self.adi_ff_batches = 1  # Number of batches used for feedforward in ADI_traindata. Used to limit vram usage
		self.reward_method = reward_method

		# Perform evaluation every evaluation_interval and after last rollout
		if evaluation_interval:
			self.evaluation_rollouts = np.arange(0, self.rollouts, evaluation_interval)-1
			if evaluation_interval == 1:
				self.evaluation_rollouts = self.evaluation_rollouts[1:]
			else:
				self.evaluation_rollouts[0] = 0
			if self.rollouts-1 != self.evaluation_rollouts[-1]:
				self.evaluation_rollouts = np.append(self.evaluation_rollouts, self.rollouts-1)
		else:
			self.evaluation_rollouts = np.array([])
		self.agent = agent

		self.tau = tau
		self.alpha_update = alpha_update
		self.lr	= lr
		self.gamma = gamma
		self.update_interval = update_interval  # How often alpha and lr are updated

		self.optim = optim_fn
		self.policy_criterion = policy_criterion(reduction='none')
		self.value_criterion = value_criterion(reduction='none')

		self.evaluator = evaluator
		self.log = logger
		self.log("\n".join([
			"Created trainer",
			f"Alpha update: {self.alpha_update:.2f}",
			f"Learning rate and gamma: {self.lr} and {self.gamma}",
			f"Learning rate and alpha will update every {self.update_interval} rollouts: lr <- {self.gamma:.4f} * lr and alpha += {self.alpha_update:.4f}"\
				if self.update_interval else "Learning rate and alpha will not be updated during training",
			f"Optimizer:      {self.optim}",
			f"Policy and value criteria: {self.policy_criterion} and {self.value_criterion}",
			f"Rollouts:       {self.rollouts}",
			f"Batch size:     {self.batch_size}",
			f"Rollout games:  {self.rollout_games}",
			f"Rollout depth:  {self.rollout_depth}",
			f"alpha update:   {self.alpha_update}",
		]))

		self.with_analysis = with_analysis
		if self.with_analysis:
			self.analysis = TrainAnalysis(self.evaluation_rollouts, self.rollout_games, self.rollout_depth, extra_evals=100, reward_method=reward_method, logger=self.log) #Logger should not be set in standard use

		self.tt = TickTock()
Example #7
class Train:

	states_per_rollout: int

	train_rollouts: np.ndarray
	value_losses: np.ndarray
	policy_losses: np.ndarray
	train_losses: np.ndarray
	sol_percents: list

	def __init__(self,
				 rollouts: int,
				 batch_size: int,  # Required to be > 1 when training with batchnorm
				 rollout_games: int,
				 rollout_depth: int,
				 optim_fn,
				 alpha_update: float,
				 lr: float,
				 gamma: float,
				 update_interval: int,
				 agent: DeepAgent,
				 evaluator: Evaluator,
				 evaluation_interval: int,
				 with_analysis: bool,
				 tau: float,
				 reward_method: str,
				 policy_criterion	= torch.nn.CrossEntropyLoss,
				 value_criterion	= torch.nn.MSELoss,
				 logger: Logger		= NullLogger(),
				 ):
		"""Sets up evaluation array, instantiates critera and stores and documents settings


		:param bool with_analysis: If true, a number of statistics relating to loss behaviour and model output are stored.
		:param float alpha_update: alpha <- alpha + alpha_update every update_interval rollouts (excl. rollout 0)
		:param float gamma: lr <- lr * gamma every update_interval rollouts (excl. rollout 0)
		:param float tau: How much of the new network to use to generate ADI data
		"""
		self.rollouts = rollouts
		self.train_rollouts = np.arange(self.rollouts)
		self.batch_size = self.states_per_rollout if not batch_size else batch_size
		self.rollout_games = rollout_games
		self.rollout_depth = rollout_depth
		self.adi_ff_batches = 1  # Number of batches used for feedforward in ADI_traindata. Used to limit vram usage
		self.reward_method = reward_method

		# Perform evaluation every evaluation_interval and after last rollout
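		# arange(0, rollouts, interval) - 1 marks the rollout just before each interval boundary; the leading -1 is
		# clamped to 0 (or dropped when interval == 1), and the final rollout is appended if it is not already included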
		if evaluation_interval:
			self.evaluation_rollouts = np.arange(0, self.rollouts, evaluation_interval)-1
			if evaluation_interval == 1:
				self.evaluation_rollouts = self.evaluation_rollouts[1:]
			else:
				self.evaluation_rollouts[0] = 0
			if self.rollouts-1 != self.evaluation_rollouts[-1]:
				self.evaluation_rollouts = np.append(self.evaluation_rollouts, self.rollouts-1)
		else:
			self.evaluation_rollouts = np.array([])
		self.agent = agent

		self.tau = tau
		self.alpha_update = alpha_update
		self.lr	= lr
		self.gamma = gamma
		self.update_interval = update_interval  # How often alpha and lr are updated

		self.optim = optim_fn
		self.policy_criterion = policy_criterion(reduction='none')
		self.value_criterion = value_criterion(reduction='none')

		self.evaluator = evaluator
		self.log = logger
		self.log("\n".join([
			"Created trainer",
			f"Alpha update: {self.alpha_update:.2f}",
			f"Learning rate and gamma: {self.lr} and {self.gamma}",
			f"Learning rate and alpha will update every {self.update_interval} rollouts: lr <- {self.gamma:.4f} * lr and alpha += {self.alpha_update:.4f}"\
				if self.update_interval else "Learning rate and alpha will not be updated during training",
			f"Optimizer:      {self.optim}",
			f"Policy and value criteria: {self.policy_criterion} and {self.value_criterion}",
			f"Rollouts:       {self.rollouts}",
			f"Batch size:     {self.batch_size}",
			f"Rollout games:  {self.rollout_games}",
			f"Rollout depth:  {self.rollout_depth}",
			f"alpha update:   {self.alpha_update}",
		]))

		self.with_analysis = with_analysis
		if self.with_analysis:
			self.analysis = TrainAnalysis(self.evaluation_rollouts, self.rollout_games, self.rollout_depth, extra_evals=100, reward_method=reward_method, logger=self.log) #Logger should not be set in standard use

		self.tt = TickTock()


	def train(self, net: Model) -> (Model, Model):
		""" Training loop: generates data, optimizes parameters, evaluates (sometimes) and repeats.

		Trains `net` for `self.rollouts` rollouts, each consisting of `self.rollout_games` games scrambled to a depth of `self.rollout_depth`.
		The network is evaluated for each rollout number in `self.evaluation_rollouts` according to `self.evaluator`.
		Stores multiple performance and training results.

		:param Model net: The network to be trained. Must accept input consistent with cube.get_oh_size()
		:return: The network after all evaluations and the network with the best evaluation score (win fraction)
		:rtype: (Model, Model)
		"""

		self.tt.reset()
		self.tt.tick()
		self.states_per_rollout = self.rollout_depth * self.rollout_games
		self.log(f"Beginning training. Optimization is performed in batches of {self.batch_size}")
		self.log("\n".join([
			f"Rollouts: {self.rollouts}",
			f"Each consisting of {self.rollout_games} games with a depth of {self.rollout_depth}",
			f"Evaluations: {len(self.evaluation_rollouts)}",
		]))
		best_solve = 0
		best_net = net.clone()
		self.agent.net = net
		if self.with_analysis:
			self.analysis.orig_params = net.get_params()

		generator_net = net.clone()

		# alpha weights the ADI loss between inverse-depth weighting (alpha = 0) and uniform weighting (alpha = 1)
		# and is increased by alpha_update every update_interval rollouts
		alpha = 1 if self.alpha_update == 1 else 0
		optimizer = self.optim(net.parameters(), lr=self.lr)
		lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, self.gamma)
		self.policy_losses = np.zeros(self.rollouts)
		self.value_losses = np.zeros(self.rollouts)
		self.train_losses = np.empty(self.rollouts)
		self.sol_percents = list()

		for rollout in range(self.rollouts):
			reset_cuda()

			generator_net = self._update_gen_net(generator_net, net) if self.tau != 1 else net

			self.tt.profile("ADI training data")
			training_data, policy_targets, value_targets, loss_weights = self.ADI_traindata(generator_net, alpha)
			self.tt.profile("To cuda")
			training_data = training_data.to(gpu)
			policy_targets = policy_targets.to(gpu)
			value_targets = value_targets.to(gpu)
			loss_weights = loss_weights.to(gpu)
			self.tt.end_profile("To cuda")
			self.tt.end_profile("ADI training data")

			reset_cuda()

			self.tt.profile("Training loop")
			net.train()
			batches = self._get_batches(self.states_per_rollout, self.batch_size)
			for i, batch in enumerate(batches):
				optimizer.zero_grad()
				policy_pred, value_pred = net(training_data[batch], policy=True, value=True)

				# Use loss on both policy and value
				policy_loss = self.policy_criterion(policy_pred, policy_targets[batch]) * loss_weights[batch]
				value_loss = self.value_criterion(value_pred.squeeze(), value_targets[batch]) * loss_weights[batch]
				loss = torch.mean(policy_loss + value_loss)
				loss.backward()
				optimizer.step()
				self.policy_losses[rollout] += policy_loss.detach().cpu().numpy().mean() / len(batches)
				self.value_losses[rollout] += value_loss.detach().cpu().numpy().mean() / len(batches)

				if self.with_analysis: #Save policy output to compute entropy
					with torch.no_grad():
						self.analysis.rollout_policy.append(
							torch.nn.functional.softmax(policy_pred.detach(), dim=0).cpu().numpy()
						)

			self.train_losses[rollout] = (self.policy_losses[rollout] + self.value_losses[rollout])
			self.tt.end_profile("Training loop")

			# Updates learning rate and alpha
			if rollout and self.update_interval and rollout % self.update_interval == 0:
				if self.gamma != 1:
					lr_scheduler.step()
					lr = optimizer.param_groups[0]["lr"]
					self.log(f"Updated learning rate from {lr/self.gamma:.2e} to {lr:.2e}")
				if (alpha + self.alpha_update <= 1 or np.isclose(alpha + self.alpha_update, 1)) and self.alpha_update:
					alpha += self.alpha_update
					self.log(f"Updated alpha from {alpha-self.alpha_update:.2f} to {alpha:.2f}")
				elif alpha < 1 and alpha + self.alpha_update > 1 and self.alpha_update:
					self.log(f"Updated alpha from {alpha:.2f} to 1")
					alpha = 1

			if self.log.is_verbose() or rollout in (np.linspace(0, 1, 20)*self.rollouts).astype(int):
				self.log(f"Rollout {rollout} completed with mean loss {self.train_losses[rollout]}")

			if self.with_analysis:
				self.tt.profile("Analysis of rollout")
				self.analysis.rollout(net, rollout, value_targets)
				self.tt.end_profile("Analysis of rollout")

			if rollout in self.evaluation_rollouts:
				net.eval()

				self.agent.net = net
				self.tt.profile(f"Evaluating using agent {self.agent}")
				with unverbose:
					eval_results, _, _ = self.evaluator.eval(self.agent)
				eval_reward = (eval_results != -1).mean()
				self.sol_percents.append(eval_reward)
				self.tt.end_profile(f"Evaluating using agent {self.agent}")

				if eval_reward > best_solve:
					best_solve = eval_reward
					best_net = net.clone()
					self.log(f"Updated best net with solve rate {eval_reward*100:.2f} % at depth {self.evaluator.scrambling_depths}")

		self.log.section("Finished training")
		if len(self.evaluation_rollouts):
			self.log(f"Best net solves {best_solve*100:.2f} % of games at depth {self.evaluator.scrambling_depths}")
		self.log.verbose("Training time distribution")
		self.log.verbose(self.tt)
		total_time = self.tt.tock()
		eval_time = self.tt.profiles[f'Evaluating using agent {self.agent}'].sum() if len(self.evaluation_rollouts) else 0
		train_time = self.tt.profiles["Training loop"].sum()
		adi_time = self.tt.profiles["ADI training data"].sum()
		nstates = self.rollouts * self.rollout_games * self.rollout_depth * cube.action_dim
		states_per_sec = int(nstates / (adi_time+train_time))
		self.log("\n".join([
			f"Total running time:               {self.tt.stringify_time(total_time, TimeUnit.second)}",
			f"- Training data for ADI:          {self.tt.stringify_time(adi_time, TimeUnit.second)} or {adi_time/total_time*100:.2f} %",
			f"- Training time:                  {self.tt.stringify_time(train_time, TimeUnit.second)} or {train_time/total_time*100:.2f} %",
			f"- Evaluation time:                {self.tt.stringify_time(eval_time, TimeUnit.second)} or {eval_time/total_time*100:.2f} %",
			f"States witnessed incl. substates: {TickTock.thousand_seps(nstates)}",
			f"- Per training second:            {TickTock.thousand_seps(states_per_sec)}",
		]))

		return net, best_net

	def _get_adi_ff_slices(self):
		data_points = self.rollout_games * self.rollout_depth * cube.action_dim
		slice_size = data_points // self.adi_ff_batches + 1
		# Final slice may have overflow, however this is simply ignored when indexing
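		# e.g. 100 data points in 3 batches gives slice_size 34 and slices [0:34], [34:68], [68:102]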
		slices = [slice(i*slice_size, (i+1)*slice_size) for i in range(self.adi_ff_batches)]
		return slices

	@no_grad
	def ADI_traindata(self, net, alpha: float):
		""" Training data generation

		Implements Autodidactic Iteration as per McAleer, Agostinelli, Shmakov and Baldi, "Solving the Rubik's Cube Without Human Knowledge" section 4.1
		Loss weighting depends on `alpha`.

		:param Model net: The network used for generating the training data. According to ADI, this should be the network from the last rollout.
		:param float alpha: Interpolation between inverse-depth loss weighting (0) and uniform loss weighting (1).

		:return:  Games * sequence_length number of observations divided in four arrays
			- states contains the rubiks state for each data point
			- policy_targets and value_targets contains optimal value and policy targets for each training point
			- loss_weights contains the weight for each training point (see weighted samples subsection of McAleer et al paper)

		:rtype: (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor)

		"""
		net.eval()
		self.tt.profile("Scrambling")
		# Only include solved state in training if using Max Lapan convergence fix
		states, oh_states = cube.sequence_scrambler(self.rollout_games, self.rollout_depth, with_solved = self.reward_method == 'lapanfix')
		self.tt.end_profile("Scrambling")

		# Keeps track of solved states - Max Lapan's convergence fix
		solved_scrambled_states = cube.multi_is_solved(states)

		# Generates possible substates for all scrambled states. Shape: n_states*action_dim x *Cube_shape
		self.tt.profile("ADI substates")
		substates = cube.multi_rotate(np.repeat(states, cube.action_dim, axis=0), *cube.iter_actions(len(states)))
		self.tt.end_profile("ADI substates")
		self.tt.profile("One-hot encoding")
		substates_oh = cube.as_oh(substates)
		self.tt.end_profile("One-hot encoding")

		self.tt.profile("Reward")
		solved_substates = cube.multi_is_solved(substates)
		# Reward for won state is 1 normally but 0 if running with reward0
		rewards = (torch.zeros if self.reward_method == 'reward0' else torch.ones)\
			(*solved_substates.shape)
		rewards[~solved_substates] = -1
		self.tt.end_profile("Reward")

		# Generates policy and value targets
		self.tt.profile("ADI feedforward")
		while True:
			try:
				value_parts = [net(substates_oh[slice_], policy=False, value=True).squeeze() for slice_ in self._get_adi_ff_slices()]
				values = torch.cat(value_parts).cpu()
				break
			except RuntimeError as e:  # Usually caused by running out of VRAM; if so, the number of feedforward batches is doubled, otherwise the error is re-raised
				if "alloc" not in str(e):
					raise e
				self.log.verbose(f"Intercepted RuntimeError {e}\nIncreasing number of ADI feed forward batches from {self.adi_ff_batches} to {self.adi_ff_batches*2}")
				self.adi_ff_batches *= 2
		self.tt.end_profile("ADI feedforward")

		self.tt.profile("Calculating targets")
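		# Bellman-style targets: add each substate's reward to the network's value estimate, then take the best of the
		# 12 substates per scrambled state as the policy target and its value as the value target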
		values += rewards
		values = values.reshape(-1, 12)
		policy_targets = torch.argmax(values, dim=1)
		value_targets = values[np.arange(len(values)), policy_targets]
		if self.reward_method == 'lapanfix':
			# Trains on goal state, sets goalstate to 0
			value_targets[solved_scrambled_states] = 0
		elif self.reward_method == 'schultzfix':
			# Does not train on goal state, but sets first 12 substates to 0
			first_substates = np.zeros(len(states), dtype=bool)
			first_substates[np.arange(0, len(states), self.rollout_depth)] = True
			value_targets[first_substates] = 0

		self.tt.end_profile("Calculating targets")

		# Weighting examples according to alpha
		weighted = np.tile(1 / np.arange(1, self.rollout_depth+1), self.rollout_games)
		unweighted = np.ones_like(weighted)
		ws, us = weighted.sum(), len(unweighted)
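		# Convex combination of the two weightings; each is normalised to sum to 1, then rescaled so the final weights sum to ws + us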
		loss_weights = ((1-alpha) * weighted / ws + alpha * unweighted / us) * (ws + us)

		if self.with_analysis:
			self.tt.profile("ADI analysis")
			self.analysis.ADI(values)
			self.tt.end_profile("ADI analysis")
		return oh_states, policy_targets, value_targets, torch.from_numpy(loss_weights).float()

	def _update_gen_net(self, generator_net: Model, net: Model):
		"""Create a network with parameters weighted by self.tau"""
		self.tt.profile("Creating generator network")
		genparams, netparams = generator_net.state_dict(), net.state_dict()
		new_genparams = dict(genparams)
		for pname, param in netparams.items():
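			# Soft update: tau * newly trained parameters + (1 - tau) * previous generator parameters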
			new_genparams[pname].data.copy_(
					self.tau * param.data.to(gpu) + (1-self.tau) * new_genparams[pname].data.to(gpu)
					)
		generator_net.load_state_dict(new_genparams)
		self.tt.end_profile("Creating generator network")
		return generator_net.to(gpu)

	def plot_training(self, save_dir: str, name: str, semi_logy=False, show=False):
		"""
		Visualizes training by showing training loss + evaluation reward in same plot
		"""
		self.log("Making plot of training")
		fig, loss_ax = plt.subplots(figsize=(23, 10))

		colour = "red"
		loss_ax.set_ylabel("Training loss")
		loss_ax.plot(self.train_rollouts, self.train_losses,  linewidth=3,                        color=colour,   label="Training loss")
		loss_ax.plot(self.train_rollouts, self.policy_losses, linewidth=2, linestyle="dashdot",   color="orange", label="Policy loss")
		loss_ax.plot(self.train_rollouts, self.value_losses,  linewidth=2, linestyle="dashed",    color="green",  label="Value loss")
		loss_ax.tick_params(axis='y', labelcolor=colour)
		loss_ax.set_xlabel(f"Rollout, each of {TickTock.thousand_seps(self.states_per_rollout)} states")
		loss_ax.set_ylim(np.array([-0.05*1.35, 1.35]) * self.train_losses.max())
		h1, l1 = loss_ax.get_legend_handles_labels()

		if len(self.evaluation_rollouts):
			color = 'blue'
			reward_ax = loss_ax.twinx()
			reward_ax.set_ylim([-5, 105])
			reward_ax.set_ylabel("Solve rate (~95 % CI) [%]")
			sol_shares = np.array(self.sol_percents)
			bernoulli_errors = bernoulli_error(sol_shares, self.evaluator.n_games, alpha=0.05)
			reward_ax.errorbar(self.evaluation_rollouts, sol_shares*100, bernoulli_errors*100, fmt="-o",
				capsize=10, color=color, label="Policy performance", errorevery=2, alpha=0.8)
			reward_ax.tick_params(axis='y', labelcolor=color)
			h2, l2 = reward_ax.get_legend_handles_labels()
			h1 += h2
			l1 += l2
		loss_ax.legend(h1, l1, loc=2)

		title = (f"Training - {TickTock.thousand_seps(self.rollouts*self.rollout_games*self.rollout_depth)} states")
		plt.title(title)
		fig.tight_layout()
		if semi_logy: plt.semilogy()
		plt.grid(True)

		os.makedirs(save_dir, exist_ok=True)
		path = os.path.join(save_dir, f"training_{name}.png")
		plt.savefig(path)
		self.log(f"Saved loss and evaluation plot to {path}")

		if show: plt.show()
		plt.clf()

	@staticmethod
	def _get_batches(size: int, bsize: int):
		"""
		Generates shuffled batches of indices covering all data points
		"""
		nbatches = int(np.ceil(size/bsize))
		idcs = np.arange(size)
		np.random.shuffle(idcs)
		batches = [idcs[batch*bsize:(batch+1)*bsize] for batch in range(nbatches)]
		return batches
Example #8
class Evaluator:
	def __init__(self,
		         n_games,
		         scrambling_depths: range or list,
		         max_time = None,  # Max time to completion per game
		         max_states = None,  # The max number of states to explore per game
		         logger: Logger = NullLogger()
		):

		self.n_games = n_games
		self.max_time = max_time
		self.max_states = max_states

		self.tt = TickTock()
		self.log = logger
		# Use array of scrambling depths if not deep evaluation, else just a one-element array containing 0
		self.scrambling_depths = np.array(scrambling_depths) if scrambling_depths != range(0) else np.array([0])

		self.log("\n".join([
			"Creating evaluator",
			f"Games per scrambling depth: {self.n_games}",
			f"Scrambling depths: {scrambling_depths if self._isdeep() else 'Uniformly sampled in [100, 999]'}",
		]))

	def _isdeep(self):
		return self.scrambling_depths.size == 1 and self.scrambling_depths[0] == 0

	def approximate_time(self):
		return self.max_time * self.n_games * len(self.scrambling_depths)

	def _eval_game(self, agent: agents.Agent, depth: int, profile: str):
		turns_to_complete = -1  # -1 for unfinished
		state, _, _ = cube.scramble(depth, True)
		self.tt.profile(profile)
		solution_found = agent.search(state, self.max_time, self.max_states)
		dt = self.tt.end_profile(profile)
		if solution_found: turns_to_complete = len(agent.action_queue)
		return turns_to_complete, dt

	def eval(self, agent: agents.Agent) -> (np.ndarray, np.ndarray, np.ndarray):
		"""
		Evaluates an agent
		Returns results, which is a len(self.scrambling_depths) x self.n_games matrix
		Each entry contains the number of steps needed to solve the scrambled cube or -1 if not solved
		"""
		self.log.section(f"Evaluation of {agent}")
		self.log("\n".join([
			f"{self.n_games*len(self.scrambling_depths)} cubes",
			f"Maximum solve time per cube is {TickTock.stringify_time(self.max_time, TimeUnit.second)} "
			f"and estimated total time <= {TickTock.stringify_time(self.approximate_time(), TimeUnit.minute)}" if self.max_time else "No time limit given",
			f"Maximum number of explored states is {TickTock.thousand_seps(self.max_states)}" if self.max_states else "No max states given",
		]))
		
		res = []
		states = []
		times = []
		for d in self.scrambling_depths:
			for _ in range(self.n_games):
				if self._isdeep():  # Randomly sample evaluation depth for deep evaluations
					d = np.random.randint(100, 1000)
				p = f"Evaluation of {agent}. Depth {'100 - 999' if self._isdeep() else d}"
				r, dt = self._eval_game(agent, d, p)

				res.append(r)
				states.append(len(agent))
				times.append(dt)
			if not self._isdeep():
				self.log.verbose(f"Performed evaluation at depth: {d}/{self.scrambling_depths[-1]}")

		res = np.reshape(res, (len(self.scrambling_depths), self.n_games))
		states = np.reshape(states, (len(self.scrambling_depths), self.n_games))
		times = np.reshape(times, (len(self.scrambling_depths), self.n_games))

		self.log(f"Evaluation results")
		for i, d in enumerate(self.scrambling_depths):
			self.log_this_depth(res[i], states[i], times[i], d)

		self.log.verbose(f"Evaluation runtime\n{self.tt}")

		return res, states, times

	def log_this_depth(self, res: np.ndarray, states: np.ndarray, times: np.ndarray, depth: int):
		"""Logs summary statistics for given depth

		:param res:  Vector of results
		:param states: Vector of seen states for each game
		:param times: Vector of runtimes for each game
		:param depth:  Scrambling depth at which results were generated
		"""
		share_completed = np.count_nonzero(res!=-1)*100/len(res)
		won_games = res[res!=-1]
		self.log(f"Scrambling depth {depth if depth else 'deep'}", with_timestamp=False)
		self.log(
			f"\tShare completed: {share_completed:.2f} % {bernoulli_error(share_completed/100, len(res), 0.05, stringify=True)} (approx. 95 % CI)",
			with_timestamp=False
		)
		if won_games.size:
			mean_turns = won_games.mean()
			median_turns = np.median(won_games)
			std_turns = won_games.std()
			self.log(
				f"\tTurns to win: {mean_turns:.2f} +/- {std_turns:.1f} (std.), Median: {median_turns:.0f}",
				with_timestamp=False
			)

		safe_times = times != 0  # Exclude zero runtimes to avoid division by zero in the states-per-second estimate
		states_per_sec = states[safe_times] / times[safe_times]
		self.log(
			f"\tStates seen: Pr. game: {states.mean():.2f} +/- {states.std():.0f} (std.), "\
			f"Pr. sec.: {states_per_sec.mean():.2f} +/- {states_per_sec.std():.0f} (std.)", with_timestamp=False)
		self.log(f"\tTime:  {times.mean():.2f} +/- {times.std():.2f} (std.)", with_timestamp=False)

	@classmethod
	def plot_evaluators(cls, eval_results: dict, eval_states: dict, eval_times: dict, eval_settings: dict, save_dir: str, title: str='') -> list:
		"""
		Plots evaluation results
		:param eval_results:   { agent name: [steps to solve, -1 for unfinished] }
		:param eval_states:    { agent name: [states seen during solving] }
		:param eval_times:     { agent name: [time spent solving] }
		:param eval_settings:  { agent name: { 'n_games': int, 'max_time': float, 'max_states': int, 'scrambling_depths': np.ndarray } }
		:param save_dir:       Directory in which to save plots
		:param title:          If given, overrides auto generated title in (depth, winrate) plot
		:return:               Locations of saved plots
		"""
		assert eval_results.keys() == eval_states.keys() == eval_times.keys() == eval_settings.keys(), "Keys of evaluation dictionaries should match"
		os.makedirs(save_dir, exist_ok=True)

		tab_colours = list(mcolour.TABLEAU_COLORS)
		colours = [tab_colours[i%len(tab_colours)] for i in range(len(eval_results))]

		save_paths = [
			cls._plot_depth_win(eval_results, save_dir, eval_settings, colours, title),
			cls._sol_length_boxplots(eval_results, save_dir, eval_settings, colours),
		]
		# Only plot (time, winrate), (states, winrate), and their distributions if settings are the same
		if all(cls.check_equal_settings(eval_settings)):
			d = cls._get_a_value(eval_settings)["scrambling_depths"][-1]
			save_paths.extend([
				cls._time_states_winrate_plot(eval_results, eval_times, True, d, save_dir, eval_settings, colours),
				cls._time_states_winrate_plot(eval_results, eval_states, False, d, save_dir, eval_settings, colours),
			])
			p = cls._distribution_plots(eval_results, eval_times, eval_states, d, save_dir, eval_settings, colours)
			if p != "ERROR":
				save_paths.extend(p)

		return save_paths
	
	@classmethod
	def _plot_depth_win(cls, eval_results: dict, save_dir: str, eval_settings: dict, colours: list, title: str='') -> str:
		# depth, win%-graph
		games_equal, times_equal = cls.check_equal_settings(eval_settings)
		fig, ax = plt.subplots(figsize=(19.2, 10.8))
		ax.set_ylabel(f"Percentage of {cls._get_a_value(eval_settings)['n_games']} games won" if games_equal else "Percentage of games won")
		ax.set_xlabel(f"Scrambling depth: Number of random rotations applied to cubes")
		ax.locator_params(axis='x', integer=True, tight=True)

		for i, (agent, results) in enumerate(eval_results.items()):
			used_settings = eval_settings[agent]
			color = colours[i]
			win_percentages = (results != -1).mean(axis=1) * 100

			ax.plot(used_settings['scrambling_depths'], win_percentages, linestyle='dashdot', color=color)
			ax.scatter(used_settings['scrambling_depths'], win_percentages, color=color, label=agent)
		ax.legend()
		ax.set_ylim([-5, 105])
		ax.grid(True)
		ax.set_title(title if title else (f"Percentage of cubes solved in {cls._get_a_value(eval_settings)['max_time']:.2f} seconds" if times_equal else "Cubes solved"))
		fig.tight_layout()

		path = os.path.join(save_dir, "eval_winrates.png")
		plt.savefig(path)
		plt.clf()

		return path

	@classmethod
	def _sol_length_boxplots(cls, eval_results: dict, save_dir: str, eval_settings: dict, colours: list) -> str:
		# Solution length boxplots
		plt.rcParams.update(rc_params_small)
		max_width = 2
		width = min(len(eval_results), max_width)
		height = (len(eval_results)+1) // width if width == max_width else 1
		positions = [(i, j) for i in range(height) for j in range(width)]
		fig, axes = plt.subplots(height, width, figsize=(width*10, height*6))

		max_sollength = 50
		agents, agent_results = list(zip(*eval_results.items()))
		agent_results = tuple(x.copy() for x in agent_results)
		for res in agent_results:
			res[res > max_sollength] = max_sollength
		ylim = np.array([-0.02, 1.02]) * max([res.max() for res in agent_results])
		min_ = min([x["scrambling_depths"][0] for x in eval_settings.values()])
		max_ = max([x["scrambling_depths"][-1] for x in eval_settings.values()])
		xticks = np.arange(min_, max_+1, max(np.ceil((max_-min_+1)/8).astype(int), 1))
		for used_settings, (i, position) in zip(eval_settings.values(), enumerate(positions)):
			# Make sure axes are stored in a matrix so they are easier to work with, then select the axes object
			if len(eval_results) == 1:
				axes = np.array([[axes]])
			elif len(eval_results) <= width and i == 0:
				axes = np.expand_dims(axes, 0)
			ax = axes[position]
			if position[1] == 0:
				ax.set_ylabel(f"Solution length")
			if position[0] == height - 1 or len(eval_results) <= width:
				ax.set_xlabel(f"Scrambling depth")
			ax.locator_params(axis="y", integer=True, tight=True)

			try:
				agent, results = agents[i], agent_results[i]
				assert type(agent) == str, str(type(agent))
				ax.set_title(agent if axes.size > 1 else "Solution lengths for " + agent)
				results = [depth[depth != -1] for depth in results]
				ax.boxplot(results)
				ax.grid(True)
			except IndexError:
				pass
			ax.set_ylim(ylim)
			ax.set_xlim([used_settings["scrambling_depths"].min()-1, used_settings["scrambling_depths"].max()+1])

		plt.setp(axes, xticks=xticks, xticklabels=[str(x) for x in xticks])
		plt.rcParams.update(rc_params)
		if axes.size > 1:
			fig.suptitle("Solution lengths")
		fig.tight_layout()
		fig.subplots_adjust(top=0.88)
		path = os.path.join(save_dir, "eval_sollengths.png")
		plt.savefig(path)
		plt.clf()

		return path

	@classmethod
	def _time_states_winrate_plot(cls, eval_results: dict, eval_times_or_states: dict, is_times: bool,
	                              depth: int, save_dir: str, eval_settings: dict, colours: list) -> str:
		# Make a (time spent, winrate) plot if is_times else (states explored, winrate)
		# Only done for the deepest configuration
		plt.figure(figsize=(19.2, 10.8))
		max_value = 0
		for (agent, res), values, colour in zip(eval_results.items(), eval_times_or_states.values(), colours):
			sort_idcs = np.argsort(values.ravel())  # Use values from all different depths - mainly for deep evaluation
			wins, values = (res != -1).ravel()[sort_idcs], values.ravel()[sort_idcs]
			max_value = max(max_value, values.max())
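			# Cumulative share of all games solved within the given time/state budget, in percent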
			cumulative_winrate = np.cumsum(wins) / len(wins) * 100
			plt.plot(values, cumulative_winrate, "o-", linewidth=3, color=colour, label=agent)
		plt.xlabel("Time used [s]" if is_times else "States explored")
		plt.ylabel("Winrate [%]")
		plt.xlim([-0.05*max_value, 1.05*max_value])
		plt.ylim([-5, 105])
		plt.legend()
		plt.title(f"Winrate against {'time used for' if is_times else 'states seen during'} solving at depth {depth if depth else '100 - 999'}")
		plt.grid(True)
		plt.tight_layout()
		path = os.path.join(save_dir, "time_winrate.png" if is_times else "states_winrate.png")
		plt.savefig(path)
		plt.clf()
		
		return path
		
	@classmethod
	def _distribution_plots(cls, eval_results: dict, eval_times: dict, eval_states: dict, depth: int,
	                        save_dir: str, eval_settings: dict, colours: list) -> str:
		"""Histograms of solution length, time used, and states explored for won games"""

		normal_pdf = lambda x, mu, sigma: np.exp(-1/2 * ((x-mu)/sigma)**2) / (sigma * np.sqrt(2*np.pi))

		won_games    = { agent: (res != -1).ravel() for agent, res in eval_results.items() }
		if all(w.sum() <= 1 for w in won_games.values()):
			return "ERROR"
		eval_results = { agent: res.ravel()[won_games[agent]]    for agent, res    in eval_results.items() if won_games[agent].sum() > 1 }
		eval_times   = { agent: times.ravel()[won_games[agent]]  for agent, times  in eval_times.items()   if won_games[agent].sum() > 1 }
		eval_states  = { agent: states.ravel()[won_games[agent]] for agent, states in eval_states.items()  if won_games[agent].sum() > 1 }

		eval_data    = [eval_results, eval_times, eval_states]
		x_labels     = ["Solution length", "Time used [s]", "States seen"]
		titles       = ["Distribution of solution lengths for solved cubes",
		                "Distribution of time used for solved cubes",
						"Distribution of states seen for solved cubes"]
		paths        = [os.path.join(save_dir, x) + ".png" for x in ["solve_length_dist", "time_dist", "state_dist"]]
		paths_iter   = iter(paths)

		for data, xlab, title, path in zip(eval_data, x_labels, titles, paths):
			plt.figure(figsize=(19.2, 10.8))
			agents = list(data.keys())
			values = [data[agent] for agent in agents]
			apply_to_values = lambda fun: fun([fun(v) for v in values])
			mus, sigmas = np.array([v.mean() for v in values]), np.array([v.std() for v in values])
			min_, max_ = apply_to_values(np.min), apply_to_values(np.max)
			if xlab == "Solution length":
				lower, higher = min_ - 2, max_ + 2
			else:
				lower = min_ - (max_ - min_) * 0.1
				higher = max_ + (max_ - min_) * 0.1
			highest_y = 0
			for i, (agent, v) in enumerate(zip(agents, values)):
				bins = np.arange(lower, higher+1) if xlab == "Solution length" else int(np.sqrt(len(v))*2) + 1
				heights, _, _ = plt.hist(x=v, bins=bins, density=True, color=colours[i], edgecolor="black", linewidth=2,
				                         alpha=0.5, align="left" if xlab == "Solution length" else "mid", label=f"{agent}: {mus[i]:.2f}")
				highest_y = max(highest_y, np.max(heights))
			if xlab == "Solution length":
				for i in range(len(data)):
					if sigmas[i] > 0:
						x = np.linspace(lower, higher, 1000)
						y = normal_pdf(x, mus[i], sigmas[i])
						x = x[~np.isnan(y)]
						y = y[~np.isnan(y)]
						plt.plot(x, y, color="black", linewidth=9)
						plt.plot(x, y, color=colours[i], linewidth=5)
						highest_y = max(highest_y, y.max())
			plt.xlim([lower, higher])
			plt.ylim([0, highest_y*(1+0.1*max(3, len(eval_results)))])  # To make room for labels
			plt.xlabel(xlab)
			plt.ylabel("Frequency")
			plt.title(f"{title} at depth {depth if depth else '100 - 999'}")
			plt.legend()
			plt.savefig(next(paths_iter))
			plt.clf()

		return paths
	
	@staticmethod
	def _get_a_value(obj: dict):
		"""Returns a vaue from the object"""
		return obj[list(obj.keys())[0]]

	@staticmethod
	def check_equal_settings(eval_settings: dict):
		"""Super simple looper just to hide the ugliness"""
		games, times = list(), list()
		for setting in eval_settings.values():
			games.append(setting['max_time'])
			times.append(setting['n_games'])
		return games.count(games[0]) == len(games), times.count(times[0]) == len(times)
Example #9
	def __init__(self):
		self.action_queue = deque()
		self.tt = TickTock()
Example #10
from datetime import timedelta

import numpy as np

from runtrain import options
from librubiks.jobs import TrainJob
from librubiks.utils import set_seeds, Logger, Parser, TickTock

if __name__ == "__main__":
    set_seeds()
    parser = Parser(
        options,
        description="Estimate the amount of times required for given jobs",
        name="train")
    estimated_runtime = 0
    tt = TickTock()
    job_settings = parser.parse(False)
    for settings in job_settings:
        job_rollouts = settings["rollouts"]
        job_evaluation_interval = settings["evaluation_interval"]
        settings["rollouts"] = 5  # Five rollouts should be good enough to give a decent estimate
        settings["evaluation_interval"] = 0
        # Estimates training time
        tt.tick()
        train = TrainJob(**settings)
        train.execute()
        estimated_runtime += tt.tock() * job_rollouts / settings["rollouts"]
        # Estimates evaluation time
        evaluations = job_rollouts / job_evaluation_interval if job_evaluation_interval else 0
        estimated_runtime += np.ceil(
Example #11
import matplotlib.pyplot as plt
import numpy as np
import torch

from librubiks import gpu, no_grad
from librubiks import cube
from librubiks.model import Model
from librubiks.utils import TickTock, Logger

tt = TickTock()
log = Logger("data/local_analyses/net.log", "Analyzing MCTS")
net = Model.load("data/local_method_comparison/asgerfix").eval().to(gpu)


def _get_adi_ff_slices(b, n):
	slice_size = n // b + 1
	# Final slice may have overflow, however this is simply ignored when indexing
	slices = [slice(i * slice_size, (i + 1) * slice_size) for i in range(b)]
	return slices

def _ff(oh_states, value=True, policy=True):
	batches = 1
	while True:
		try:
			value_parts = [net(oh_states[slice_], policy=policy, value=value).squeeze() for slice_ in
						   _get_adi_ff_slices(batches, len(oh_states))]
			values = torch.cat(value_parts).cpu()
			break
		except RuntimeError as e:  # Usually caused by running out of VRAM; if so, the number of feedforward batches is doubled, otherwise the error is re-raised
			if "alloc" not in str(e):
				raise e