Example #1
 def test_iter_actions(self):
     actions = np.array([
         [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5] * 2,
         [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] * 2,
     ], dtype=np.uint8)
     assert np.all(actions == cube.iter_actions(2))
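From the test above, `cube.iter_actions(n)` appears to enumerate the 12 (face, direction) actions and repeat them `n` times. A minimal NumPy sketch consistent with the expected array; `iter_actions_reference` is a hypothetical name, not the repository function:

import numpy as np

def iter_actions_reference(n: int = 1) -> np.ndarray:
	# Row 0: face indices 0..5, each repeated once per direction
	faces = np.repeat(np.arange(6), 2)
	# Row 1: direction flag, alternating 1 and 0
	directions = np.tile([1, 0], 6)
	# Tile the 2 x 12 action table n times along the last axis -> shape (2, 12 * n)
	return np.tile(np.array([faces, directions], dtype=np.uint8), n)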
Example #2
	def _step(self, state: np.ndarray) -> (int, np.ndarray, bool):
		# Generate all substates reachable from `state` with a single rotation
		substates = cube.multi_rotate(cube.repeat_state(state, cube.action_dim), *cube.iter_actions())
		solutions = cube.multi_is_solved(substates)
		if np.any(solutions):
			# A solved substate exists: return the first action that reaches it
			action = np.where(solutions)[0][0]
			return action, substates[action], True
		else:
			# Otherwise act greedily on the value network's estimates of the substates
			substates_oh = cube.as_oh(substates)
			v = self.net(substates_oh, policy=False).squeeze().cpu().numpy()
			action = np.argmax(v)
			return action, substates[action], False
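A hedged usage sketch of how `_step` could drive a full greedy solve loop; `greedy_solve`, `agent`, and `max_steps` are hypothetical names used only for illustration:

import numpy as np

def greedy_solve(agent, state: np.ndarray, max_steps: int = 100) -> (list, bool):
	# Repeatedly take the chosen action until a solved state is reached or the step budget runs out
	actions = []
	for _ in range(max_steps):
		action, state, solved = agent._step(state)
		actions.append(action)
		if solved:
			return actions, True
	return actions, False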
Example #3
	def _complete_graph(self):
		"""
		Ensures that the graph is complete by expanding around all leaves and updating neighbors
		"""
		self.tt.profile("Complete graph")
		leaves_idcs = np.where(self.leaves[:len(self)+1])[0][1:]
		actions_taken = np.tile(np.arange(cube.action_dim), len(leaves_idcs))
		repeated_leaves_idcs = np.repeat(leaves_idcs, cube.action_dim)
		substates = cube.multi_rotate(self.states[repeated_leaves_idcs], *cube.iter_actions(len(leaves_idcs)))
		substate_strs = [s.tostring() for s in substates]
		# Substates not yet in the graph map to index 0, which serves as a sentinel
		substate_idcs = np.array([self.indices.get(s, 0) for s in substate_strs])
		# Link each leaf to its substates and each known substate back to the leaf via the inverse action
		self.neighbors[repeated_leaves_idcs, actions_taken] = substate_idcs
		self.neighbors[substate_idcs, cube.rev_actions(actions_taken)] = repeated_leaves_idcs
		self.neighbors[0] = 0  # Reset the sentinel row, which the writes above may have touched
		self.tt.end_profile("Complete graph")
Example #4
	def __init__(self,
				 evaluations: np.ndarray,
				 games: int,
				 depth: int,
				 extra_evals: int,
				 reward_method: str,
				 logger: Logger = NullLogger()):
		"""Initialize containers mostly

		:param np.ndarray evaluations:  array of the evaluations performed on the model. Used for the more intensive analysis
		:param int depth: Rollout depth
		:param extra_evals: If != 0, extra evaluations are added for the first `exta_evals` rollouts

		"""

		self.games = games
		self.depth = depth
		self.depths = np.arange(depth)
		self.extra_evals = min(evaluations[-1] if len(evaluations) else 0, extra_evals)  # Won't add extra evals beyond the last scheduled evaluation (or if no evals are needed)
		self.evaluations = np.unique(np.append(evaluations, range(self.extra_evals)))
		self.reward_method = reward_method

		self.orig_params = None
		self.params = None

		self.first_states = np.stack((
				cube.get_solved(),
				*cube.multi_rotate(cube.repeat_state(cube.get_solved(), cube.action_dim), *cube.iter_actions())
				))
		self.first_states = cube.as_oh(self.first_states)
		self.first_state_values = list()

		self.substate_val_stds = list()

		self.avg_value_targets = list()
		self.param_changes = list()
		self.param_total_changes = list()

		self.policy_entropies = list()
		self.rollout_policy = list()

		self.log = logger
		self.log.verbose(f"Analysis of this training was enabled. Extra analysis is done for evaluations and for first {extra_evals} rollouts")
Example #5
	def ADI_traindata(self, net, alpha: float):
		""" Training data generation

		Implements Autodidactic Iteration as per McAleer, Agostinelli, Shmakov and Baldi, "Solving the Rubik's Cube Without Human Knowledge" section 4.1
		Loss weighting is dependant on `self.loss_weighting`.

		:param torch.nn.Model net: The network used for generating the training data. This should according to ADI be the network from the last rollout.
		:param int rollout:  The current rollout number. Used in adaptive loss weighting.

		:return:  Games * sequence_length number of observations divided in four arrays
			- states contains the rubiks state for each data point
			- policy_targets and value_targets contains optimal value and policy targets for each training point
			- loss_weights contains the weight for each training point (see weighted samples subsection of McAleer et al paper)

		:rtype: (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor)

		"""
		net.eval()
		self.tt.profile("Scrambling")
		# Only include solved state in training if using Max Lapan convergence fix
		states, oh_states = cube.sequence_scrambler(self.rollout_games, self.rollout_depth, with_solved = self.reward_method == 'lapanfix')
		self.tt.end_profile("Scrambling")

		# Keeps track of solved states - Max Lapan's convergence fix
		solved_scrambled_states = cube.multi_is_solved(states)

		# Generates possible substates for all scrambled states. Shape: n_states*action_dim x *Cube_shape
		self.tt.profile("ADI substates")
		substates = cube.multi_rotate(np.repeat(states, cube.action_dim, axis=0), *cube.iter_actions(len(states)))
		self.tt.end_profile("ADI substates")
		self.tt.profile("One-hot encoding")
		substates_oh = cube.as_oh(substates)
		self.tt.end_profile("One-hot encoding")

		self.tt.profile("Reward")
		solved_substates = cube.multi_is_solved(substates)
		# Reward for won state is 1 normally but 0 if running with reward0
		rewards = (torch.zeros if self.reward_method == 'reward0' else torch.ones)\
			(*solved_substates.shape)
		rewards[~solved_substates] = -1
		self.tt.end_profile("Reward")

		# Generates policy and value targets
		self.tt.profile("ADI feedforward")
		while True:
			try:
				value_parts = [net(substates_oh[slice_], policy=False, value=True).squeeze() for slice_ in self._get_adi_ff_slices()]
				values = torch.cat(value_parts).cpu()
				break
			except RuntimeError as e:  # Usually caused by running out of VRAM; if so, the data is split into more batches, otherwise the error is re-raised
				if "alloc" not in str(e):
					raise e
				self.log.verbose(f"Intercepted RuntimeError {e}\nIncreasing number of ADI feed forward batches from {self.adi_ff_batches} to {self.adi_ff_batches*2}")
				self.adi_ff_batches *= 2
		self.tt.end_profile("ADI feedforward")

		self.tt.profile("Calculating targets")
		values += rewards
		values = values.reshape(-1, cube.action_dim)  # One row of substate values per scrambled state
		policy_targets = torch.argmax(values, dim=1)
		value_targets = values[np.arange(len(values)), policy_targets]
		if self.reward_method == 'lapanfix':
			# Trains on goal state, sets goalstate to 0
			value_targets[solved_scrambled_states] = 0
		elif self.reward_method == 'schultzfix':
			# Does not train on the goal state, but sets the value target of the first state in each game to 0
			first_substates = np.zeros(len(states), dtype=bool)
			first_substates[np.arange(0, len(states), self.rollout_depth)] = True
			value_targets[first_substates] = 0

		self.tt.end_profile("Calculating targets")

		# Weighting examples according to alpha
		weighted = np.tile(1 / np.arange(1, self.rollout_depth+1), self.rollout_games)
		unweighted = np.ones_like(weighted)
		ws, us = weighted.sum(), len(unweighted)
		loss_weights = ((1-alpha) * weighted / ws + alpha * unweighted / us) * (ws + us)

		if self.with_analysis:
			self.tt.profile("ADI analysis")
			self.analysis.ADI(values)
			self.tt.end_profile("ADI analysis")
		return oh_states, policy_targets, value_targets, torch.from_numpy(loss_weights).float()
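A hedged worked example of the alpha-weighted loss scheme at the end of `ADI_traindata`, assuming `rollout_games = 2` and `rollout_depth = 3`:

import numpy as np

rollout_games, rollout_depth, alpha = 2, 3, 0.5
weighted = np.tile(1 / np.arange(1, rollout_depth + 1), rollout_games)  # [1, 1/2, 1/3, 1, 1/2, 1/3]
unweighted = np.ones_like(weighted)
ws, us = weighted.sum(), len(unweighted)
loss_weights = ((1 - alpha) * weighted / ws + alpha * unweighted / us) * (ws + us)
# alpha = 0 recovers pure 1/depth weighting, alpha = 1 gives uniform weights,
# and loss_weights sums to ws + us for every alpha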
Example #6
	def expand_leaf(self, visited_states_idcs: list, actions_taken: list) -> (int, int):
		"""
		Expands around the given leaf and updates V and W in all visited_states_idcs
		Returns the action taken to solve the cube. -1 if no solution is found
		:param visited_states_idcs: List of states that have been visited including the starting node. Length n
		:param actions_taken: List of actions taken from starting state. Length n-1
		:return: The index of the leaf that is the solution and the action that must be taken from leaf_index.
			Both are 0 if solution is not found
		"""
		if len(self) + cube.action_dim > len(self.states):
			self.increase_stack_size()

		leaf_index = visited_states_idcs[-1]
		solve_leaf, solve_action = -1, -1

		self.tt.profile("Get substates")
		state = self.states[leaf_index]
		substates = cube.multi_rotate(cube.repeat_state(state), *cube.iter_actions())
		self.tt.end_profile("Get substates")

		# Check what states have been seen already
		substate_strs = [s.tostring() for s in substates]  # Unique identifier for each substate
		get_substate_strs = lambda bools: [s for s, b in zip(substate_strs, bools) if b]  # Convenient way to index into the list with a boolean array
		seen_substates = np.array([s in self.indices for s in substate_strs])  # States already in the graph
		unseen_substates = ~seen_substates  # States not already in the graph

		self.tt.profile("Update indices and states")
		new_states_idcs = len(self) + np.arange(unseen_substates.sum()) + 1
		new_idcs_dict = { s: i for i, s in zip(new_states_idcs, get_substate_strs(unseen_substates)) }
		self.indices.update(new_idcs_dict)
		substate_idcs = np.array([self.indices[s] for s in substate_strs])
		new_substate_idcs = substate_idcs[unseen_substates]
		new_substates = substates[unseen_substates]
		self.states[new_substate_idcs] = new_substates
		self.tt.end_profile("Update indices and states")

		self.tt.profile("Update neigbors and leaf status")
		actions = np.arange(cube.action_dim)
		self.neighbors[leaf_index, actions] = substate_idcs
		self.neighbors[substate_idcs, cube.rev_actions(actions)] = leaf_index
		self.leaves[leaf_index] = False
		self.tt.end_profile("Update neigbors and leaf status")

		self.tt.profile("Check for solution")
		solved_substate = np.where(cube.multi_is_solved(substates))[0]
		if solved_substate.size:
			solve_leaf = substate_idcs[solved_substate[0]]
			solve_action = solved_substate[0]
		self.tt.end_profile("Check for solution")

		# Update policy, value, and W
		self.tt.profile("One-hot encoding")
		new_substates_oh = cube.as_oh(new_substates)
		self.tt.end_profile("One-hot encoding")
		self.tt.profile("Feedforward")
		p, v = self.net(new_substates_oh)
		p, v = p.cpu().softmax(dim=1).numpy(), v.cpu().numpy().squeeze()
		self.tt.end_profile("Feedforward")

		self.tt.profile("Update P, V, and W")
		self.P[new_substate_idcs] = p
		self.V[new_substate_idcs] = v

		best_substate_v = v.max()
		self.W[leaf_index] = self.V[self.neighbors[leaf_index]]
		self.W[new_substate_idcs] = np.tile(v, (cube.action_dim, 1)).T
		self.W[visited_states_idcs[:-1], actions_taken] = np.maximum(self.W[visited_states_idcs[:-1], actions_taken], best_substate_v)
		self.tt.end_profile("Update P, V, and W")

		# Update N and L
		self.tt.profile("Update N and L")
		if actions_taken:  # Crashes if actions_taken is empty, which happens on the first run
			self.N[visited_states_idcs[:-1], actions_taken] += 1
			self.L[visited_states_idcs[:-1], actions_taken] = 0
			self.L[visited_states_idcs[1:], cube.rev_actions(np.array(actions_taken))] = 0
		self.tt.end_profile("Update N and L")

		return solve_leaf, solve_action
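A hedged sketch of how `expand_leaf` might be driven from a single search iteration; `search_iteration` and `find_leaf` (the tree-policy descent) are hypothetical names used only for illustration:

def search_iteration(agent, start_index: int = 1) -> bool:
	# Walk the tree from the root to an unexpanded leaf, recording the path
	visited_states_idcs, actions_taken = agent.find_leaf(start_index)
	# Expand the leaf; solve_leaf == -1 means no solved substate was found
	solve_leaf, solve_action = agent.expand_leaf(visited_states_idcs, actions_taken)
	return solve_leaf != -1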
Example #7
	def expand_batch(self, expand_idcs: np.ndarray) -> bool:
		"""
		Expands to the neighbors of each of the states in
		Loose pseudo code:
		```
		1. Calculate children for all the batched expansion states
		2. Check which children are seen and not seen
		3. FOR the unseen
			IF they are the goal state: RETURN TRUE
			Set the state as their parent and set their G
			Calculate their H and add to open-list with correct cost
		4. RELAX(seen) #See psudeo code under `relax_seen_states`
		5. RETURN FALSE
		```

		:param expand_idcs: Indices corresponding to states in `self.states` of states from which to expand
		:return: True iff. solution was found in this expansion
		"""
		expand_size = len(expand_idcs)
		while len(self) + expand_size * cube.action_dim > len(self.states):
			self.increase_stack_size()

		self.tt.profile("Calculate substates")
		parent_idcs = np.repeat(expand_idcs, cube.action_dim, axis=0)
		substates = cube.multi_rotate(
			self.states[parent_idcs],
			*cube.iter_actions(expand_size)
		)
		actions_taken = np.tile(np.arange(cube.action_dim), expand_size)
		self.tt.end_profile("Calculate substates")

		self.tt.profile("Find new substates")
		substate_strs = [s.tostring() for s in substates]
		get_substate_strs = lambda bools: [s for s, b in zip(substate_strs, bools) if b]
		seen_substates = np.array([s in self.indices for s in substate_strs])
		unseen_substates = ~seen_substates
			# Handle duplicates
		first_occurrences	= np.zeros(len(substate_strs), dtype=bool)
		_, first_indices	= np.unique(substate_strs, return_index=True)
		first_occurrences[first_indices] = True
		first_seen			= first_occurrences & seen_substates
		first_unseen		= first_occurrences & unseen_substates
		self.tt.end_profile("Find new substates")

		self.tt.profile("Add substates to data structure")
		new_states			= substates[first_unseen]
		new_states_idcs		= len(self) + np.arange(first_unseen.sum()) + 1
		new_idcs_dict		= { s: i for i, s in zip(new_states_idcs, get_substate_strs(first_unseen)) }
		self.indices.update(new_idcs_dict)
		substate_idcs		= np.array([self.indices[s] for s in substate_strs])
		old_states_idcs		= substate_idcs[first_seen]

		self.states[new_states_idcs] = substates[first_unseen]
		self.tt.end_profile("Add substates to data structure")

		self.tt.profile("Update new state values")
		new_parent_idcs = parent_idcs[first_unseen]
		self.G[new_states_idcs] = self.G[new_parent_idcs] + 1
		self.parent_actions[new_states_idcs] = actions_taken[first_unseen]
		self.parents[new_states_idcs] = new_parent_idcs
			# Add the new states to "open" priority queue
		costs = self.cost(new_states, new_states_idcs)
		for i, cost in enumerate(costs):
			heapq.heappush(self.open_queue, (cost, new_states_idcs[i]))
		self.tt.end_profile("Update new state values")

		self.tt.profile("Check whether won")
		solved_substates = cube.multi_is_solved(new_states)
		if solved_substates.any():
			return True
		self.tt.end_profile("Check whether won")

		self.tt.profile("Old states: Update parents and G")
		seen_batch_idcs = np.where(first_seen)[0]  # Batch indices of the substates that were already seen
		self.relax_seen_states(old_states_idcs, parent_idcs[seen_batch_idcs], actions_taken[seen_batch_idcs])
		self.tt.end_profile("Old states: Update parents and G")

		return False
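A hedged sketch of the batched best-first loop that would drive `expand_batch`, popping the lowest-cost states from the open queue and expanding them together; `batched_search` and the `expansions` batch size are hypothetical:

import heapq
import numpy as np

def batched_search(agent, expansions: int) -> bool:
	# Keep expanding the cheapest open states in batches until a solution is found
	# or the open queue is exhausted
	while agent.open_queue:
		batch_size = min(expansions, len(agent.open_queue))
		expand_idcs = np.array([heapq.heappop(agent.open_queue)[1] for _ in range(batch_size)])
		if agent.expand_batch(expand_idcs):
			return True
	return False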