def _get_states(shape: tuple):
    """Generate an array of scrambled cube states.

    `shape` is either (depth, n_states) or (n_states,) — the latter is
    treated as depth 1.  Row 0 holds `n_states` copies of the solved
    state; each subsequent row i applies one uniformly random move
    (face in [0, 6), direction in {0, 1}) to every state of row i-1.

    Returns an array of shape (depth, n_states, *cube.shape()) with
    dtype cube.dtype.
    """
    # Prepend a singleton depth axis when only a state count is given.
    shape = (*shape, *cube.shape()) if len(shape) > 1 else (1, *shape, *cube.shape())
    # FIX: removed unused local `n` (was bound to shape[0] but never read).
    n_states = shape[1]
    states = np.empty(shape, dtype=cube.dtype)
    states[0] = cube.repeat_state(cube.get_solved(), n_states)
    for i in range(1, len(states)):
        # One independent random move per state in the batch.
        faces = np.random.randint(0, 6, n_states)
        dirs = np.random.randint(0, 2, n_states)
        states[i] = cube.multi_rotate(states[i - 1], faces, dirs)
    return states
def expand(self, state: np.ndarray) -> (list, np.ndarray, torch.tensor, tuple):
    """Run `self.workers` parallel epsilon-greedy rollouts of length `self.depth`.

    From `state`, each worker repeatedly picks either a random action
    (with probability self.epsilon) or the policy network's argmax
    action, recording the action sequence per worker.

    Returns (paths, new_states, new_states_oh, (worker, depth)):
    if a rollout reaches a solved state, new_states and new_states_oh
    are None and the final tuple identifies the solving worker and the
    number of moves it took; otherwise the tuple is (-1, -1) and the
    arrays hold every visited state (raw and one-hot).
    """
    # Initialize needed data structures
    states = cube.repeat_state(state, self.workers)
    states_oh = cube.as_oh(states)
    # FIX: removed the duplicated `paths = paths = ...` assignment (typo).
    paths = np.empty((self.workers, self.depth), dtype=int)  # Index n contains path for worker n
    new_states = np.empty((self.workers * self.depth, *cube.shape()), dtype=cube.dtype)
    new_states_oh = torch.empty(self.workers * self.depth, cube.get_oh_shape(), dtype=torch.float, device=gpu)
    # Expand for self.depth iterations
    for d in range(self.depth):
        # Use epsilon-greedy to decide where to use policy and random actions
        use_random = np.random.choice(2, self.workers, p=[1-self.epsilon, self.epsilon]).astype(bool)
        use_policy = ~use_random
        actions = np.empty(self.workers, dtype=int)
        # Random actions
        actions[use_random] = np.random.randint(0, cube.action_dim, use_random.sum())
        # Policy actions: network gives per-action scores; take the argmax.
        p = self.net(states_oh[use_policy], value=False).cpu().numpy()
        actions[use_policy] = p.argmax(axis=1)
        # Update paths
        paths[:, d] = actions
        # Expand using selected actions
        faces, dirs = cube.indices_to_actions(actions)
        states = cube.multi_rotate(states, faces, dirs)
        states_oh = cube.as_oh(states)
        solved_states = cube.multi_is_solved(states)
        if np.any(solved_states):
            # Early exit: report the first solving worker and the depth reached.
            self._explored_states += (d+1) * self.workers
            w = np.where(solved_states)[0][0]
            return paths, None, None, (w, d+1)
        new_states[self._get_indices(d)] = states
        new_states_oh[self._get_indices(d)] = states_oh
    self._explored_states += len(new_states)
    return paths, new_states, new_states_oh, (-1, -1)
def increase_stack_size(self):
    """Double the capacity of the search-stack arrays.

    Grows states, parents, parent_actions and G by their current length,
    preserving all existing entries.
    """
    extra = len(self.states)
    # New state rows are left uninitialized; parent/action slots start at 0.
    fresh_states = np.empty((extra, *cube.shape()), dtype=cube.dtype)
    self.states = np.concatenate((self.states, fresh_states))
    self.parents = np.concatenate((self.parents, np.zeros(extra, dtype=int)))
    self.parent_actions = np.concatenate((self.parent_actions, np.zeros(extra, dtype=int)))
    self.G = np.concatenate((self.G, np.empty(extra)))
def increase_stack_size(self):
    """Double the capacity of every per-node array in the search tree.

    Extends states plus the tree statistics (neighbors, leaves, P, V, N,
    W, L) by the current node count, keeping all existing data intact.
    """
    extra = len(self.states)
    acts = cube.action_dim
    self.states = np.concatenate((self.states, np.empty((extra, *cube.shape()), dtype=cube.dtype)))
    # Unexplored slots: no neighbor links yet, flagged as leaves.
    self.neighbors = np.concatenate((self.neighbors, np.zeros((extra, acts), dtype=int)))
    self.leaves = np.concatenate((self.leaves, np.ones(extra, dtype=bool)))
    # P and V are filled on expansion, so the new rows stay uninitialized.
    self.P = np.concatenate((self.P, np.empty((extra, acts))))
    self.V = np.concatenate((self.V, np.empty(extra)))
    # Visit counts and accumulated/virtual values start at zero.
    self.N = np.concatenate((self.N, np.zeros((extra, acts), dtype=int)))
    self.W = np.concatenate((self.W, np.zeros((extra, acts))))
    self.L = np.concatenate((self.L, np.zeros((extra, acts))))
def reset(self, time_limit: float, max_states: int) -> (float, int):
    """Reset the search and preallocate the node stack.

    Delegates limit handling to the superclass, clears the open queue
    and index map, and allocates `self._stack_expand` node slots.
    Returns the (possibly adjusted) time_limit and max_states.
    """
    time_limit, max_states = super().reset(time_limit, max_states)
    self.open_queue = []
    self.indices = {}
    cap = self._stack_expand
    self.states = np.empty((cap, *cube.shape()), dtype=cube.dtype)
    # NOTE(review): parents is np.empty here but np.zeros when the stack
    # grows — presumably fine since the root's parent is never read; confirm.
    self.parents = np.empty(cap, dtype=int)
    self.parent_actions = np.zeros(cap, dtype=int)
    self.G = np.empty(cap)
    return time_limit, max_states
def reset(self, time_limit: float, max_states: int):
    """Reset the tree search and preallocate all per-node arrays.

    Delegates limit handling to the superclass, clears the index map,
    and allocates `self.expand_nodes` slots for states plus the tree
    statistics (neighbors, leaves, P, V, N, W, L).
    Returns the (possibly adjusted) time_limit and max_states.
    """
    time_limit, max_states = super().reset(time_limit, max_states)
    self.indices = {}
    cap = self.expand_nodes
    acts = cube.action_dim
    self.states = np.empty((cap, *cube.shape()), dtype=cube.dtype)
    # Fresh nodes have no neighbor links and count as leaves.
    self.neighbors = np.zeros((cap, acts), dtype=int)
    self.leaves = np.ones(cap, dtype=bool)
    # Policy/value buffers are filled on expansion, so left uninitialized.
    self.P = np.empty((cap, acts))
    self.V = np.empty(cap)
    # Visit counts and accumulated/virtual values start at zero.
    self.N = np.zeros((cap, acts), dtype=int)
    self.W = np.zeros((cap, acts))
    self.L = np.zeros((cap, acts))
    return time_limit, max_states