def test_iter_actions(self):
    """Check that `cube.iter_actions(2)` reproduces the hand-written action table."""
    # The expected table has two rows, each a 12-entry pattern repeated twice.
    # NOTE(review): the rows presumably hold face index and rotation direction
    # respectively -- confirm against `cube.iter_actions`.
    first_row = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5] * 2
    second_row = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] * 2
    expected = np.array([first_row, second_row], dtype=np.uint8)

    produced = cube.iter_actions(2)
    assert np.all(expected == produced)
def _step(self, state: np.ndarray) -> (int, np.ndarray, bool):
    """
    Picks the next move from `state` by looking a single rotation ahead

    All `cube.action_dim` rotations of `state` are generated. If at least one of them is solved,
    the first solving action is returned. Otherwise the value output of `self.net` is computed
    for every rotated state and the action with the largest value is returned.

    :param state: The state to move from
    :return: The chosen action, the state reached by taking it, and whether that state is solved
    """
    repeated = cube.repeat_state(state, cube.action_dim)
    neighbours = cube.multi_rotate(repeated, *cube.iter_actions())

    # Indices of the rotations that lead directly to the solved state
    solving_actions = np.where(cube.multi_is_solved(neighbours))[0]
    if solving_actions.size:
        best = solving_actions[0]
        return best, neighbours[best], True

    # No immediate solution: let the value head of the network rank the neighbours
    neighbours_oh = cube.as_oh(neighbours)
    values = self.net(neighbours_oh, policy=False).squeeze().cpu().numpy()
    best = np.argmax(values)
    return best, neighbours[best], False
def _complete_graph(self):
    """
    Ensures that the graph is complete by expanding around all leaves and updating neighbors

    Every state flagged in `self.leaves` (except the first flagged index) is rotated with all
    `cube.action_dim` actions. A rotation that lands on a state already present in `self.indices`
    is linked in `self.neighbors` in both directions. A rotation that lands on an unknown state
    is linked to index 0, and row 0 of `self.neighbors` is reset to 0 afterwards.
    """
    self.tt.profile("Complete graph")

    # Flagged leaves among the first len(self)+1 states; the first hit is dropped.
    # NOTE(review): presumably index 0 is a placeholder that is always flagged -- confirm
    leaves_idcs = np.where(self.leaves[:len(self)+1])[0][1:]
    n_leaves = len(leaves_idcs)

    # Entry i of the arrays below refers to leaf i // action_dim and action i % action_dim
    actions_taken = np.tile(np.arange(cube.action_dim), n_leaves)
    repeated_leaves_idcs = np.repeat(leaves_idcs, cube.action_dim)
    substates = cube.multi_rotate(self.states[repeated_leaves_idcs], *cube.iter_actions(n_leaves))

    # `tobytes` yields the same keys as the deprecated `tostring`. Unknown states map to 0.
    # The explicit integer dtype keeps the array usable as an index when there are no leaves,
    # where an empty list would otherwise become a float array.
    substate_idcs = np.array([self.indices.get(s.tobytes(), 0) for s in substates], dtype=int)

    self.neighbors[repeated_leaves_idcs, actions_taken] = substate_idcs
    self.neighbors[substate_idcs, cube.rev_actions(actions_taken)] = repeated_leaves_idcs
    # The back-links above were also written into row 0 for unknown states; clear them again
    self.neighbors[0] = 0

    self.tt.end_profile("Complete graph")
def __init__(self, evaluations: np.ndarray, games: int, depth: int, extra_evals: int, reward_method: str, logger: Logger = NullLogger()):
    """
    Sets up the containers that are filled in while the training is analysed

    :param np.ndarray evaluations: Array of the evaluations performed on the model. Used for the more intensive analysis
    :param int games: Stored as `self.games`
    :param int depth: Rollout depth
    :param int extra_evals: If != 0, extra evaluations are added for the first `extra_evals` rollouts.
        Capped at the last entry of `evaluations`, and 0 if `evaluations` is empty
    :param str reward_method: Stored as `self.reward_method`
    :param Logger logger: Logger that receives the verbose start-up message
    """
    self.games = games
    self.depth = depth
    self.depths = np.arange(depth)

    # Won't add evals in the future (or if no evals are needed)
    last_evaluation = evaluations[-1] if len(evaluations) else 0
    self.extra_evals = min(last_evaluation, extra_evals)
    extra_rollouts = range(self.extra_evals)
    self.evaluations = np.unique(np.append(evaluations, extra_rollouts))

    self.reward_method = reward_method

    self.orig_params = None
    self.params = None

    # The solved state followed by every state one rotation away from it, one-hot encoded
    solved_state = cube.get_solved()
    solved_neighbours = cube.multi_rotate(
        cube.repeat_state(cube.get_solved(), cube.action_dim),
        *cube.iter_actions(),
    )
    self.first_states = cube.as_oh(np.stack((solved_state, *solved_neighbours)))

    # Per-rollout statistics; all start out empty
    self.first_state_values = []
    self.substate_val_stds = []
    self.avg_value_targets = []
    self.param_changes = []
    self.param_total_changes = []
    self.policy_entropies = []
    self.rollout_policy = []

    self.log = logger
    self.log.verbose(f"Analysis of this training was enabled. Extra analysis is done for evaluations and for first {extra_evals} rollouts")
def ADI_traindata(self, net, alpha: float):
    """
    Training data generation

    Implements Autodidactic Iteration as per McAleer, Agostinelli, Shmakov and Baldi,
    "Solving the Rubik's Cube Without Human Knowledge" section 4.1
    Rewards and value targets depend on `self.reward_method`; loss weighting depends on `alpha`.
    `net` is put in eval mode as a side effect.

    :param torch.nn.Model net: The network used for generating the training data. This should according to ADI be the network from the last rollout.
    :param float alpha: Mixing factor for the loss weights. 0 weighs every example by 1 / (its position in the
        scrambling sequence), 1 weighs all examples equally, values in between interpolate linearly
    :return: Games * sequence_length number of observations divided in four arrays
        - states contains the rubiks state for each data point
        - policy_targets and value_targets contains optimal value and policy targets for each training point
        - loss_weights contains the weight for each training point (see weighted samples subsection of McAleer et al paper)
    :rtype: (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor)
    """
    net.eval()
    self.tt.profile("Scrambling")
    # Only include solved state in training if using Max Lapan convergence fix
    states, oh_states = cube.sequence_scrambler(self.rollout_games, self.rollout_depth, with_solved=self.reward_method == 'lapanfix')
    self.tt.end_profile("Scrambling")

    # Keeps track of solved states - Max Lapan's convergence fix
    solved_scrambled_states = cube.multi_is_solved(states)

    # Generates possible substates for all scrambled states. Shape: n_states*action_dim x *Cube_shape
    self.tt.profile("ADI substates")
    substates = cube.multi_rotate(np.repeat(states, cube.action_dim, axis=0), *cube.iter_actions(len(states)))
    self.tt.end_profile("ADI substates")

    self.tt.profile("One-hot encoding")
    substates_oh = cube.as_oh(substates)
    self.tt.end_profile("One-hot encoding")

    self.tt.profile("Reward")
    solved_substates = cube.multi_is_solved(substates)
    # Reward for won state is 1 normally but 0 if running with reward0
    make_rewards = torch.zeros if self.reward_method == 'reward0' else torch.ones
    rewards = make_rewards(*solved_substates.shape)
    rewards[~solved_substates] = -1
    self.tt.end_profile("Reward")

    # Generates policy and value targets
    self.tt.profile("ADI feedforward")
    while True:
        try:
            value_parts = [net(substates_oh[slice_], policy=False, value=True).squeeze() for slice_ in self._get_adi_ff_slices()]
            values = torch.cat(value_parts).cpu()
            break
        except RuntimeError as e:
            # Usually caused by running out of vram. If not, the error is still raised, else batch size is reduced
            if "alloc" not in str(e):
                raise  # Bare raise keeps the original traceback untouched
            self.log.verbose(f"Intercepted RuntimeError {e}\nIncreasing number of ADI feed forward batches from {self.adi_ff_batches} to {self.adi_ff_batches*2}")
            self.adi_ff_batches *= 2
    self.tt.end_profile("ADI feedforward")

    self.tt.profile("Calculating targets")
    values += rewards
    # One row per scrambled state, one column per action. The substates were generated with
    # `cube.action_dim` actions per state, so the same constant is used here
    values = values.reshape(-1, cube.action_dim)
    policy_targets = torch.argmax(values, dim=1)
    value_targets = values[np.arange(len(values)), policy_targets]
    if self.reward_method == 'lapanfix':
        # Trains on goal state, sets goalstate to 0
        value_targets[solved_scrambled_states] = 0
    elif self.reward_method == 'schultzfix':
        # Does not train on goal state, but sets the first state of every `self.rollout_depth` states to 0
        first_substates = np.zeros(len(states), dtype=bool)
        first_substates[np.arange(0, len(states), self.rollout_depth)] = True
        value_targets[first_substates] = 0
    self.tt.end_profile("Calculating targets")

    # Weighting examples according to alpha
    weighted = np.tile(1 / np.arange(1, self.rollout_depth+1), self.rollout_games)
    unweighted = np.ones_like(weighted)
    ws, us = weighted.sum(), len(unweighted)
    loss_weights = ((1-alpha) * weighted / ws + alpha * unweighted / us) * (ws + us)

    if self.with_analysis:
        self.tt.profile("ADI analysis")
        self.analysis.ADI(values)
        self.tt.end_profile("ADI analysis")
    return oh_states, policy_targets, value_targets, torch.from_numpy(loss_weights).float()
def expand_leaf(self, visited_states_idcs: list, actions_taken: list) -> (int, int):
    """
    Expands around the given leaf and updates V and W in all visited_states_idcs

    The leaf is the last entry of `visited_states_idcs`. Its substates are generated, unseen ones are added
    to the graph and evaluated by `self.net`, and the best new value is propagated back along the visited path.
    If every substate is already in the graph, the network is not called and no value is propagated.

    :param visited_states_idcs: List of states that have been visited including the starting node. Length n
    :param actions_taken: List of actions taken from starting state. Length n-1
    :return: The index of the solved substate and the action that must be taken from the leaf to reach it.
        Both are -1 if solution is not found
    """
    if len(self) + cube.action_dim > len(self.states):
        self.increase_stack_size()

    leaf_index = visited_states_idcs[-1]
    solve_leaf, solve_action = -1, -1

    self.tt.profile("Get substates")
    state = self.states[leaf_index]
    substates = cube.multi_rotate(cube.repeat_state(state), *cube.iter_actions())
    self.tt.end_profile("Get substates")

    # Check what states have been seen already
    # Unique identifier for each substate. `tobytes` gives the same bytes as the deprecated `tostring`
    substate_strs = [s.tobytes() for s in substates]
    seen_substates = np.array([s in self.indices for s in substate_strs])  # States already in the graph
    unseen_substates = ~seen_substates  # States not already in the graph

    self.tt.profile("Update indices and states")
    new_states_idcs = len(self) + np.arange(unseen_substates.sum()) + 1
    unseen_strs = [s for s, unseen in zip(substate_strs, unseen_substates) if unseen]
    new_idcs_dict = {s: i for i, s in zip(new_states_idcs, unseen_strs)}
    self.indices.update(new_idcs_dict)
    substate_idcs = np.array([self.indices[s] for s in substate_strs])
    new_substate_idcs = substate_idcs[unseen_substates]
    new_substates = substates[unseen_substates]
    self.states[new_substate_idcs] = new_substates
    self.tt.end_profile("Update indices and states")

    self.tt.profile("Update neigbors and leaf status")
    actions = np.arange(cube.action_dim)
    self.neighbors[leaf_index, actions] = substate_idcs
    self.neighbors[substate_idcs, cube.rev_actions(actions)] = leaf_index
    self.leaves[leaf_index] = False
    self.tt.end_profile("Update neigbors and leaf status")

    self.tt.profile("Check for solution")
    solved_substate = np.where(cube.multi_is_solved(substates))[0]
    if solved_substate.size:
        solve_leaf = substate_idcs[solved_substate[0]]
        solve_action = solved_substate[0]
    self.tt.end_profile("Check for solution")

    # Update policy, value, and W
    # Without new substates there is nothing to evaluate, and `v.max()` would raise on an empty array
    has_new_substates = len(new_substate_idcs) > 0
    if has_new_substates:
        self.tt.profile("One-hot encoding")
        new_substates_oh = cube.as_oh(new_substates)
        self.tt.end_profile("One-hot encoding")
        self.tt.profile("Feedforward")
        p, v = self.net(new_substates_oh)
        p, v = p.cpu().softmax(dim=1).numpy(), v.cpu().numpy().squeeze()
        self.tt.end_profile("Feedforward")

    self.tt.profile("Update P, V, and W")
    if has_new_substates:
        self.P[new_substate_idcs] = p
        self.V[new_substate_idcs] = v
    # Must come after V has been updated, since the leaf's neighbors include the new substates
    self.W[leaf_index] = self.V[self.neighbors[leaf_index]]
    if has_new_substates:
        best_substate_v = v.max()
        # Every action out of a new state starts at that state's own value
        self.W[new_substate_idcs] = np.tile(v, (cube.action_dim, 1)).T
        self.W[visited_states_idcs[:-1], actions_taken] = np.maximum(self.W[visited_states_idcs[:-1], actions_taken], best_substate_v)
    self.tt.end_profile("Update P, V, and W")

    # Update N and L
    self.tt.profile("Update N and L")
    if actions_taken:  # Crashes if actions_taken is empty, which happens on the first run
        self.N[visited_states_idcs[:-1], actions_taken] += 1
        self.L[visited_states_idcs[:-1], actions_taken] = 0
        self.L[visited_states_idcs[1:], cube.rev_actions(np.array(actions_taken))] = 0
    self.tt.end_profile("Update N and L")

    return solve_leaf, solve_action
def expand_batch(self, expand_idcs: np.ndarray) -> bool:
    """
    Expands to the neighbors of each of the states in `expand_idcs`
    Loose pseudo code:
    ```
    1. Calculate children for all the batched expansion states
    2. Check which children are seen and not seen
    3. FOR the unseen
        IF they are the goal state: RETURN TRUE
        Set the state as their parent and set their G
        Calculate their H and add to open-list with correct cost
    4. RELAX(seen) #See pseudo code under `relax_seen_states`
    5. RETURN FALSE
    ```

    :param expand_idcs: Indices corresponding to states in `self.states` of states from which to expand
    :return: True iff. solution was found in this expansion
    """
    expand_size = len(expand_idcs)
    while len(self) + expand_size * cube.action_dim > len(self.states):
        self.increase_stack_size()

    self.tt.profile("Calculate substates")
    parent_idcs = np.repeat(expand_idcs, cube.action_dim, axis=0)
    substates = cube.multi_rotate(
        self.states[parent_idcs],
        *cube.iter_actions(expand_size)
    )
    actions_taken = np.tile(np.arange(cube.action_dim), expand_size)
    self.tt.end_profile("Calculate substates")

    self.tt.profile("Find new substates")
    # `tobytes` gives the same bytes as the deprecated `tostring`
    substate_strs = [s.tobytes() for s in substates]
    seen_substates = np.array([s in self.indices for s in substate_strs])
    unseen_substates = ~seen_substates

    # Handle duplicates: only the first occurrence of each substate in the batch is processed
    first_occurences = np.zeros(len(substate_strs), dtype=bool)
    _, first_indeces = np.unique(substate_strs, return_index=True)
    first_occurences[first_indeces] = True
    first_seen = first_occurences & seen_substates
    first_unseen = first_occurences & unseen_substates
    self.tt.end_profile("Find new substates")

    self.tt.profile("Add substates to data structure")
    new_states = substates[first_unseen]
    new_states_idcs = len(self) + np.arange(first_unseen.sum()) + 1
    first_unseen_strs = [s for s, is_new in zip(substate_strs, first_unseen) if is_new]
    new_idcs_dict = {s: i for i, s in zip(new_states_idcs, first_unseen_strs)}
    self.indices.update(new_idcs_dict)
    substate_idcs = np.array([self.indices[s] for s in substate_strs])
    old_states_idcs = substate_idcs[first_seen]
    self.states[new_states_idcs] = new_states
    self.tt.end_profile("Add substates to data structure")

    self.tt.profile("Update new state values")
    new_parent_idcs = parent_idcs[first_unseen]
    self.G[new_states_idcs] = self.G[new_parent_idcs] + 1
    self.parent_actions[new_states_idcs] = actions_taken[first_unseen]
    self.parents[new_states_idcs] = new_parent_idcs

    # Add the new states to "open" priority queue
    costs = self.cost(new_states, new_states_idcs)
    for cost, new_state_idx in zip(costs, new_states_idcs):
        heapq.heappush(self.open_queue, (cost, new_state_idx))
    self.tt.end_profile("Update new state values")

    self.tt.profile("Check whether won")
    solution_found = cube.multi_is_solved(new_states).any()
    # Close the profiling section before a possible early return so it is never left open
    self.tt.end_profile("Check whether won")
    if solution_found:
        return True

    self.tt.profile("Old states: Update parents and G")
    # Parents and actions belonging to the already known states in `old_states_idcs`
    self.relax_seen_states(
        old_states_idcs,
        parent_idcs[first_seen],
        actions_taken[first_seen]
    )
    self.tt.end_profile("Old states: Update parents and G")

    return False