Example #1
	def expand(self, state: np.ndarray) -> (np.ndarray, np.ndarray, torch.Tensor, tuple):
		# Initialize needed data structures
		states = cube.repeat_state(state, self.workers)
		states_oh = cube.as_oh(states)
		paths = np.empty((self.workers, self.depth), dtype=int)  # Index n contains the path for worker n
		new_states = np.empty((self.workers * self.depth, *cube.shape()), dtype=cube.dtype)
		new_states_oh = torch.empty(self.workers * self.depth, cube.get_oh_shape(), dtype=torch.float, device=gpu)
		# Expand for self.depth iterations
		for d in range(self.depth):
			# Use epsilon-greedy to decide where to use policy and random actions
			use_random = np.random.choice(2, self.workers, p=[1-self.epsilon, self.epsilon]).astype(bool)
			use_policy = ~use_random
			actions = np.empty(self.workers, dtype=int)
			# Random actions
			actions[use_random] = np.random.randint(0, cube.action_dim, use_random.sum())
			# Policy actions
			p = self.net(states_oh[use_policy], value=False).cpu().numpy()
			actions[use_policy] = p.argmax(axis=1)
			# Update paths
			paths[:, d] = actions

			# Expand using selected actions
			faces, dirs = cube.indices_to_actions(actions)
			states = cube.multi_rotate(states, faces, dirs)
			states_oh = cube.as_oh(states)
			solved_states = cube.multi_is_solved(states)
			if np.any(solved_states):
				self._explored_states += (d+1) * self.workers
				w = np.where(solved_states)[0][0]
				return paths, None, None, (w, d+1)
			new_states[self._get_indices(d)] = states
			new_states_oh[self._get_indices(d)] = states_oh
		self._explored_states += len(new_states)

		return paths, new_states, new_states_oh, (-1, -1)
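
The only non-trivial decision inside the loop is the epsilon-greedy split between random and policy actions. Below is a minimal standalone sketch of that selection step, assuming plain NumPy and a placeholder policy_logits array of shape (workers, actions); none of the names come from the project.

import numpy as np

def epsilon_greedy_actions(policy_logits: np.ndarray, epsilon: float) -> np.ndarray:
    # One action per worker: random with probability epsilon, policy argmax otherwise
    n_workers, action_dim = policy_logits.shape
    use_random = np.random.random(n_workers) < epsilon
    actions = np.empty(n_workers, dtype=int)
    actions[use_random] = np.random.randint(0, action_dim, use_random.sum())
    actions[~use_random] = policy_logits[~use_random].argmax(axis=1)
    return actions

# Example: 4 workers, 12 possible actions, 30 % exploration
print(epsilon_greedy_actions(np.random.randn(4, 12), epsilon=0.3))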
Example #2
 def onehot(self, n: int):
     self.log.section(
         f"Benchmarking {TickTock.thousand_seps(n)} one-hot encodings, {_repstr()}"
     )
     states = _get_states((n, ))
     pname = f"One-hot encoding single state, {_repstr()}"
     for state in states.squeeze():
         self.tt.profile(pname)
         cube.as_oh(state)
         self.tt.end_profile()
     self._log_method_results("Average state encoding time", pname)
Example #3
 def multi_onehot(self, n: int, n_states: int):
     self.log.section(
         f"Benchmarking {TickTock.thousand_seps(n)} one-hot encodings of "
         f"{TickTock.thousand_seps(n_states)} states each, {_repstr()}")
     all_states = _get_states((n, n_states))
     pname = f"One-hot encoding {TickTock.thousand_seps(n_states)} states, {_repstr()}"
     for states in all_states:
         self.tt.profile(pname)
         cube.as_oh(states)
         self.tt.end_profile()
     self._log_method_results("Average state encoding time", pname,
                              n_states)
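
Examples #2 and #3 both wrap the measured call in the same profile/end_profile pattern. A rough standalone equivalent, assuming time.perf_counter in place of the project's TickTock timer and a dummy encoder standing in for cube.as_oh:

import time
import numpy as np

def benchmark(encode, states: np.ndarray, repeats: int) -> float:
    # Average wall-clock time of encode(states) over `repeats` runs
    timings = []
    for _ in range(repeats):
        start = time.perf_counter()
        encode(states)
        timings.append(time.perf_counter() - start)
    return float(np.mean(timings))

# Dummy one-hot-style encoder over integer states in [0, 24)
avg = benchmark(lambda s: np.eye(24, dtype=np.float32)[s], np.random.randint(0, 24, (1000, 20)), repeats=100)
print(f"Average encoding time: {avg * 1e6:.1f} us")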
Example #4
	def load(load_dir: str, logger=NullLogger(), load_best=False):
		"""
		Load a model from a configuration directory
		"""

		model_path = os.path.join(load_dir, "model.pt" if not load_best else "model-best.pt")
		conf_path = os.path.join(load_dir, "config.json")
		with open(conf_path, encoding="utf-8") as conf:
			try:
				state_dict = torch.load(model_path, map_location=gpu)
			except FileNotFoundError:
				model_path = os.path.join(load_dir, "model.pt")
				state_dict = torch.load(model_path, map_location=gpu)
			config = ModelConfig.from_json_dict(json.load(conf))

		model = Model.create(config, logger)
		model.load_state_dict(state_dict)
		model.to(gpu)
		# The first forward pass through a freshly loaded net is slow, so one is performed here up front
		# This avoids skewing evaluation results
		with torch.no_grad():
			model.eval()
			model(cube.as_oh(cube.get_solved()))
			model.train()
		return model
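
The forward pass at the end of load is a general warm-up trick: the first call into a freshly loaded network is slow (CUDA context creation, lazy allocations), so it is run once before anything is timed. A minimal sketch of the same idea for an arbitrary PyTorch module; the Linear(480, 12) module and input are placeholders, not the project's Model:

import torch

def warm_up(model: torch.nn.Module, example_input: torch.Tensor) -> torch.nn.Module:
    # One throwaway forward pass so later timed calls are not skewed by one-time setup costs
    model.eval()
    with torch.no_grad():
        model(example_input)
    model.train()
    return model

net = warm_up(torch.nn.Linear(480, 12), torch.zeros(1, 480))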
Example #5
	def search(self, state: np.ndarray, time_limit: float=None, max_states: int=None) -> bool:
		time_limit, max_states = self.reset(time_limit, max_states)
		self.tt.tick()

		self.indices[state.tostring()] = 1
		self.states[1] = state
		if cube.is_solved(state): return True

		oh = cube.as_oh(state)
		p, v = self.net(oh)
		self.P[1] = p.softmax(dim=1).cpu().numpy()
		self.V[1] = v.cpu().numpy()
		indices_visited = [1]
		actions_taken = []
		while self.tt.tock() < time_limit and len(self) + cube.action_dim <= max_states:
			self.tt.profile("Expanding leaves")
			solve_leaf_index, solve_action = self.expand_leaf(indices_visited, actions_taken)
			self.tt.end_profile("Expanding leaves")

			# If a solution is found
			if solve_leaf_index != -1:
				self.action_queue = deque(actions_taken) + deque([solve_action])
				if self.search_graph:
					self._complete_graph()
					self._shorten_action_queue(solve_leaf_index)
				return True

			# Find leaves
			indices_visited, actions_taken = self.find_leaf(time_limit)

		self.action_queue = deque(actions_taken)  # Generates a best guess action queue in case of no solution

		return False
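
The search loop above is an anytime search: it keeps expanding while both the time budget and the state budget allow, and falls back to a best-guess action queue if no solution is found. A stripped-down sketch of that control flow, where expand is a placeholder callable returning (actions, solved, states_added):

import time
from collections import deque

def budgeted_search(expand, time_limit: float, max_states: int) -> deque:
    # Expand until a solution is reported or either budget runs out
    start, n_states = time.perf_counter(), 0
    best_actions = []
    while time.perf_counter() - start < time_limit and n_states < max_states:
        actions, solved, added = expand()
        n_states += added
        best_actions = actions
        if solved:
            return deque(actions)
    return deque(best_actions)  # Best guess when no solution was found within budget

# Dummy expander that "solves" on its third call
calls = {"n": 0}
def dummy_expand():
    calls["n"] += 1
    return list(range(calls["n"])), calls["n"] == 3, 12

print(budgeted_search(dummy_expand, time_limit=1.0, max_states=1000))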
Example #6
	def _step(self, state: np.ndarray) -> (int, np.ndarray, bool):
		substates = cube.multi_rotate(cube.repeat_state(state, cube.action_dim), *cube.iter_actions())
		solutions = cube.multi_is_solved(substates)
		if np.any(solutions):
			action = np.where(solutions)[0][0]
			return action, substates[action], True
		else:
			substates_oh = cube.as_oh(substates)
			v = self.net(substates_oh, policy=False).squeeze().cpu().numpy()
			action = np.argmax(v)
			return action, substates[action], False
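
This agent is a one-step greedy value search: generate all child states, take a solved child if one exists, otherwise move to the child the value network scores highest. A standalone sketch with placeholder is_solved and value_fn callables:

import numpy as np

def greedy_value_step(children: np.ndarray, is_solved, value_fn):
    # Prefer a solved child; otherwise pick the child with the highest estimated value
    solved = np.array([is_solved(c) for c in children])
    if solved.any():
        action = int(np.where(solved)[0][0])
        return action, children[action], True
    action = int(np.argmax(value_fn(children)))
    return action, children[action], False

# Dummy usage: 12 random "children" scored by a toy value function
children = np.random.randn(12, 20)
print(greedy_value_step(children, lambda c: False, lambda cs: cs.sum(axis=1)))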
Example #7
	def cost(self, states: np.ndarray, indeces: np.ndarray) -> np.ndarray:
		"""The A* cost of the states using the DNN heuristic.
		Uses the value neural network; -value is regarded as the distance heuristic.
		It is not strictly necessary to accept both the states and their indices, but
		it speeds things up a bit by not having to recalculate them here.

		:param states: (batch size, *(cube_dimensions)) of states
		:param indeces: indices in self.indeces corresponding to these states.
		"""
		states = cube.as_oh(states)
		H = -self.net(states, value=True, policy=False)
		H = H.cpu().squeeze().detach().numpy()

		return self.lambda_ * self.G[indeces] + H
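
The returned cost is the weighted A* expression f(n) = lambda_ * g(n) + h(n), with the heuristic h(n) = -V(n) taken from the value head. A small NumPy illustration with made-up path costs and value estimates:

import numpy as np

def astar_cost(g: np.ndarray, values: np.ndarray, lambda_: float) -> np.ndarray:
    # Weighted A* cost: f = lambda * g + h, where h is the negated value estimate
    return lambda_ * g + (-values)

# Three frontier states with path lengths 2, 5, 7 and value estimates from a network
print(astar_cost(np.array([2.0, 5.0, 7.0]), np.array([0.9, 0.4, -0.2]), lambda_=0.5))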
Example #8
    def _mcts_test(self, state: np.ndarray, search_graph: bool):
        agent = MCTS(Model.create(ModelConfig()),
                     c=1,
                     search_graph=search_graph)
        solved = agent.search(state, .2)

        # Indices
        assert agent.indices[state.tostring()] == 1
        for s, i in agent.indices.items():
            assert agent.states[i].tostring() == s
        assert sorted(agent.indices.values())[0] == 1
        assert np.all(np.diff(sorted(agent.indices.values())) == 1)

        used_idcs = np.array(list(agent.indices.values()))

        # States
        assert np.all(agent.states[1] == state)
        for i, s in enumerate(agent.states):
            if i not in used_idcs: continue
            assert s.tostring() in agent.indices
            assert agent.indices[s.tostring()] == i

        # Neighbors
        if not search_graph:
            for i, neighs in enumerate(agent.neighbors):
                if i not in used_idcs: continue
                state = agent.states[i]
                for j, neighbor_index in enumerate(neighs):
                    assert neighbor_index == 0 or neighbor_index in agent.indices.values()
                    if neighbor_index == 0: continue
                    substate = cube.rotate(state, *cube.action_space[j])
                    assert np.all(agent.states[neighbor_index] == substate)

        # Policy and value
        with torch.no_grad():
            p, v = agent.net(cube.as_oh(agent.states[used_idcs]))
        p, v = p.softmax(dim=1).cpu().numpy(), v.squeeze().cpu().numpy()
        assert np.all(np.isclose(agent.P[used_idcs], p, atol=1e-5))
        assert np.all(np.isclose(agent.V[used_idcs], v, atol=1e-5))

        # Leaves
        if not search_graph:
            assert np.all(agent.neighbors.all(axis=1) != agent.leaves)

        # W
        assert agent.W[used_idcs].all()

        return agent, solved
Example #9
 def test_as_oh(self):
     state = cube.get_solved()
     oh = cube.as_oh(state)
     supposed_state = torch.zeros(20, 24, device=gpu)
     corners = [
         get_corner_pos(c, o)
         for c, o in zip(SimpleState.corners.tolist(),
                         SimpleState.corner_orientations.tolist())
     ]
     supposed_state[torch.arange(8), corners] = 1
     sides = [
         get_side_pos(s, o)
         for s, o in zip(SimpleState.sides.tolist(),
                         SimpleState.side_orientations.tolist())
     ]
     supposed_state[torch.arange(8, 20), sides] = 1
     assert (supposed_state.flatten() == oh).all()
Example #10
	def __init__(self,
				 evaluations: np.ndarray,
				 games: int,
				 depth: int,
				 extra_evals: int,
				 reward_method: str,
				 logger: Logger = NullLogger()):
		"""Initialize containers mostly

		:param np.ndarray evaluations:  array of the evaluations performed on the model. Used for the more intensive analysis
		:param int depth: Rollout depth
		:param extra_evals: If != 0, extra evaluations are added for the first `exta_evals` rollouts

		"""

		self.games = games
		self.depth = depth
		self.depths = np.arange(depth)
		self.extra_evals = min(evaluations[-1] if len(evaluations) else 0, extra_evals)  # Won't add evals in the future (or if no evals are needed)
		self.evaluations = np.unique(np.append(evaluations, range(self.extra_evals)))
		self.reward_method = reward_method

		self.orig_params = None
		self.params = None

		self.first_states = np.stack((
				cube.get_solved(),
				*cube.multi_rotate(cube.repeat_state(cube.get_solved(), cube.action_dim), *cube.iter_actions())
				))
		self.first_states = cube.as_oh(self.first_states)
		self.first_state_values = list()

		self.substate_val_stds = list()

		self.avg_value_targets = list()
		self.param_changes = list()
		self.param_total_changes = list()

		self.policy_entropies = list()
		self.rollout_policy = list()

		self.log = logger
		self.log.verbose(f"Analysis of this training was enabled. Extra analysis is done for evaluations and for first {extra_evals} rollouts")
Example #11
	def ADI_traindata(self, net, alpha: float):
		""" Training data generation

		Implements Autodidactic Iteration as per McAleer, Agostinelli, Shmakov and Baldi, "Solving the Rubik's Cube Without Human Knowledge" section 4.1
		Loss weighting is dependant on `self.loss_weighting`.

		:param torch.nn.Model net: The network used for generating the training data. This should according to ADI be the network from the last rollout.
		:param int rollout:  The current rollout number. Used in adaptive loss weighting.

		:return:  Games * sequence_length number of observations divided in four arrays
			- states contains the rubiks state for each data point
			- policy_targets and value_targets contains optimal value and policy targets for each training point
			- loss_weights contains the weight for each training point (see weighted samples subsection of McAleer et al paper)

		:rtype: (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor)

		"""
		net.eval()
		self.tt.profile("Scrambling")
		# Only include solved state in training if using Max Lapan convergence fix
		states, oh_states = cube.sequence_scrambler(self.rollout_games, self.rollout_depth, with_solved = self.reward_method == 'lapanfix')
		self.tt.end_profile("Scrambling")

		# Keeps track of solved states - Max Lapan's convergence fix
		solved_scrambled_states = cube.multi_is_solved(states)

		# Generates possible substates for all scrambled states. Shape: n_states*action_dim x *Cube_shape
		self.tt.profile("ADI substates")
		substates = cube.multi_rotate(np.repeat(states, cube.action_dim, axis=0), *cube.iter_actions(len(states)))
		self.tt.end_profile("ADI substates")
		self.tt.profile("One-hot encoding")
		substates_oh = cube.as_oh(substates)
		self.tt.end_profile("One-hot encoding")

		self.tt.profile("Reward")
		solved_substates = cube.multi_is_solved(substates)
		# Reward for won state is 1 normally but 0 if running with reward0
		rewards = (torch.zeros if self.reward_method == 'reward0' else torch.ones)\
			(*solved_substates.shape)
		rewards[~solved_substates] = -1
		self.tt.end_profile("Reward")

		# Generates policy and value targets
		self.tt.profile("ADI feedforward")
		while True:
			try:
				value_parts = [net(substates_oh[slice_], policy=False, value=True).squeeze() for slice_ in self._get_adi_ff_slices()]
				values = torch.cat(value_parts).cpu()
				break
			except RuntimeError as e:  # Usually caused by running out of VRAM; if so, the number of batches is doubled, otherwise the error is re-raised
				if "alloc" not in str(e):
					raise
				self.log.verbose(f"Intercepted RuntimeError {e}\nIncreasing number of ADI feed forward batches from {self.adi_ff_batches} to {self.adi_ff_batches*2}")
				self.adi_ff_batches *= 2
		self.tt.end_profile("ADI feedforward")

		self.tt.profile("Calculating targets")
		values += rewards
		values = values.reshape(-1, cube.action_dim)
		policy_targets = torch.argmax(values, dim=1)
		value_targets = values[np.arange(len(values)), policy_targets]
		if self.reward_method == 'lapanfix':
			# Trains on the goal state and sets its value target to 0
			value_targets[solved_scrambled_states] = 0
		elif self.reward_method == 'schultzfix':
			# Does not train on goal state, but sets first 12 substates to 0
			first_substates = np.zeros(len(states), dtype=bool)
			first_substates[np.arange(0, len(states), self.rollout_depth)] = True
			value_targets[first_substates] = 0

		self.tt.end_profile("Calculating targets")

		# Weighting examples according to alpha
		weighted = np.tile(1 / np.arange(1, self.rollout_depth+1), self.rollout_games)
		unweighted = np.ones_like(weighted)
		ws, us = weighted.sum(), len(unweighted)
		loss_weights = ((1-alpha) * weighted / ws + alpha * unweighted / us) * (ws + us)

		if self.with_analysis:
			self.tt.profile("ADI analysis")
			self.analysis.ADI(values)
			self.tt.end_profile("ADI analysis")
		return oh_states, policy_targets, value_targets, torch.from_numpy(loss_weights).float()
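
The heart of ADI is the target calculation: each scrambled state's children are scored by the value network, rewards are added, and the best child provides both the policy target and the backed-up value target. A simplified standalone sketch that ignores the reward_method variants; all names are placeholders:

import torch

def adi_targets(substate_values: torch.Tensor, solved: torch.Tensor, action_dim: int):
    # substate_values: (n_states * action_dim,) value estimates of every child; solved: same shape, bool
    rewards = torch.where(solved, torch.tensor(1.0), torch.tensor(-1.0))
    values = (substate_values + rewards).reshape(-1, action_dim)
    policy_targets = values.argmax(dim=1)                                 # Best action per scrambled state
    value_targets = values.gather(1, policy_targets[:, None]).squeeze(1)  # Backed-up value of that action
    return policy_targets, value_targets

# Dummy usage: 3 scrambled states with 12 children each
pt, vt = adi_targets(torch.randn(36), torch.zeros(36, dtype=torch.bool), action_dim=12)
print(pt.shape, vt.shape)  # torch.Size([3]) torch.Size([3])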
Example #12
	def expand_leaf(self, visited_states_idcs: list, actions_taken: list) -> (int, int):
		"""
		Expands around the given leaf and updates V and W in all visited_states_idcs
		Returns the action taken to solve the cube. -1 if no solution is found
		:param visited_states_idcs: List of states that have been visited including the starting node. Length n
		:param actions_taken: List of actions taken from starting state. Length n-1
		:return: The index of the leaf that is the solution and the action that must be taken from leaf_index.
			Both are -1 if no solution is found
		"""
		if len(self) + cube.action_dim > len(self.states):
			self.increase_stack_size()

		leaf_index = visited_states_idcs[-1]
		solve_leaf, solve_action = -1, -1

		self.tt.profile("Get substates")
		state = self.states[leaf_index]
		substates = cube.multi_rotate(cube.repeat_state(state), *cube.iter_actions())
		self.tt.end_profile("Get substates")

		# Check what states have been seen already
		substate_strs = [s.tostring() for s in substates]  # Unique identifier for each substate
		get_substate_strs = lambda bools: [s for s, b in zip(substate_strs, bools) if b]  # Boolean indexing into the list of substate strings
		seen_substates = np.array([s in self.indices for s in substate_strs])  # States already in the graph
		unseen_substates = ~seen_substates  # States not already in the graph

		self.tt.profile("Update indices and states")
		new_states_idcs = len(self) + np.arange(unseen_substates.sum()) + 1
		new_idcs_dict = { s: i for i, s in zip(new_states_idcs, get_substate_strs(unseen_substates)) }
		self.indices.update(new_idcs_dict)
		substate_idcs = np.array([self.indices[s] for s in substate_strs])
		new_substate_idcs = substate_idcs[unseen_substates]
		new_substates = substates[unseen_substates]
		self.states[new_substate_idcs] = new_substates
		self.tt.end_profile("Update indices and states")

		self.tt.profile("Update neigbors and leaf status")
		actions = np.arange(cube.action_dim)
		self.neighbors[leaf_index, actions] = substate_idcs
		self.neighbors[substate_idcs, cube.rev_actions(actions)] = leaf_index
		self.leaves[leaf_index] = False
		self.tt.end_profile("Update neigbors and leaf status")

		self.tt.profile("Check for solution")
		solved_substate = np.where(cube.multi_is_solved(substates))[0]
		if solved_substate.size:
			solve_leaf = substate_idcs[solved_substate[0]]
			solve_action = solved_substate[0]
		self.tt.end_profile("Check for solution")

		# Update policy, value, and W
		self.tt.profile("One-hot encoding")
		new_substates_oh = cube.as_oh(new_substates)
		self.tt.end_profile("One-hot encoding")
		self.tt.profile("Feedforward")
		p, v = self.net(new_substates_oh)
		p, v = p.cpu().softmax(dim=1).numpy(), v.cpu().numpy().squeeze()
		self.tt.end_profile("Feedforward")

		self.tt.profile("Update P, V, and W")
		self.P[new_substate_idcs] = p
		self.V[new_substate_idcs] = v

		best_substate_v = v.max()
		self.W[leaf_index] = self.V[self.neighbors[leaf_index]]
		self.W[new_substate_idcs] = np.tile(v, (cube.action_dim, 1)).T
		self.W[visited_states_idcs[:-1], actions_taken] = np.maximum(self.W[visited_states_idcs[:-1], actions_taken], best_substate_v)
		self.tt.end_profile("Update P, V, and W")

		# Update N and L
		self.tt.profile("Update N and L")
		if actions_taken:  # Crashes if actions_taken is empty, which happens on the first run
			self.N[visited_states_idcs[:-1], actions_taken] += 1
			self.L[visited_states_idcs[:-1], actions_taken] = 0
			self.L[visited_states_idcs[1:], cube.rev_actions(np.array(actions_taken))] = 0
		self.tt.end_profile("Update N and L")

		return solve_leaf, solve_action
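
The index bookkeeping in the middle of expand_leaf boils down to: hash each child state by its raw bytes, split children into seen and unseen, and hand out fresh consecutive indices to the unseen ones. A stripped-down sketch of just that step, using tobytes (the current name for tostring); index 0 stays reserved for "no neighbor" as in the snippet, and everything else is a placeholder:

import numpy as np

def register_children(children: np.ndarray, indices: dict) -> np.ndarray:
    # Return the graph index of every child, adding unseen children with new consecutive ids
    keys = [c.tobytes() for c in children]              # Unique identifier per state
    unseen = [k for k in keys if k not in indices]
    next_id = len(indices) + 1                          # Ids start at 1; 0 means "no neighbor"
    indices.update({k: next_id + i for i, k in enumerate(unseen)})
    return np.array([indices[k] for k in keys])

indices = {}
children = np.random.randint(0, 24, (12, 20))
print(register_children(children, indices))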
Example #13
	def _step(self, state: np.ndarray) -> (int, np.ndarray, bool):
		policy = torch.nn.functional.softmax(self.net(cube.as_oh(state), value=False).cpu(), dim=1).numpy().squeeze()
		action = np.random.choice(cube.action_dim, p=policy) if self.sample_policy else policy.argmax()
		state = cube.rotate(state, *cube.action_space[action])
		return action, state, cube.is_solved(state)
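
The only decision in this agent is whether to sample from the softmaxed policy or take its argmax. A standalone sketch of that choice, using torch.multinomial in place of np.random.choice for the sampling branch; the 12-way logits vector is a placeholder:

import torch

def pick_action(logits: torch.Tensor, sample: bool) -> int:
    # Sample an action from the softmaxed policy, or take the greedy argmax
    policy = torch.softmax(logits, dim=0)
    return int(torch.multinomial(policy, 1)) if sample else int(policy.argmax())

print(pick_action(torch.randn(12), sample=True), pick_action(torch.randn(12), sample=False))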