def simulateVanillaMCTS(model, state, depth, q, counts, seenStates, stateStr):
    """Run one simulation of vanilla UCB1-style MCTS from *state*.

    q/counts map state string -> {move: estimated action value / visit count};
    seenStates is the set of state strings already expanded.  Returns the
    discounted return observed along the simulated trajectory.
    """
    if depth == 0:
        return 0
    if stateStr not in seenStates:
        # Expansion: seed Q and visit counts for every legal move from the
        # value head, then estimate this leaf with a policy rollout.
        q[stateStr] = {}
        counts[stateStr] = {}
        for move in moves:
            nextState = py222.doAlgStr(state, move)
            nextStateArray = np.array([py222.getState(nextState).flatten()])
            value, _ = model.predict(nextStateArray)
            # BUG FIX: model.predict returns a (1, 1) array; store the scalar
            # value[0][0] (as doADI and the full-MCTS solver do), not the
            # array itself, so Q values stay plain floats.
            q[stateStr][move] = value[0][0] + reward(nextState)
            counts[stateStr][move] = 1
        seenStates.add(stateStr)
        return rolloutVanillaMCTS(model, state, depth)
    # Selection: UCB1 score = Q(s, a) + c * sqrt(ln N(s) / N(s, a)).
    totalStateCounts = sum(counts[stateStr][move] for move in moves)
    allQuantities = np.zeros(len(moves))
    for i, move in enumerate(moves):
        exploration = constants.kMCTSExploration * math.sqrt(
            math.log(totalStateCounts) / counts[stateStr][move])
        allQuantities[i] = q[stateStr][move] + exploration
    bestMove = moves[allQuantities.argmax()]
    nextState = py222.doAlgStr(state, bestMove)
    # Recurse one ply deeper, then back up the discounted return.
    newQ = reward(nextState) + constants.kDiscountFactor * simulateVanillaMCTS(
        model, nextState, depth - 1, q, counts, seenStates, str(nextState))
    counts[stateStr][bestMove] += 1
    # Incremental-mean update of Q(s, a).
    q[stateStr][bestMove] += (
        newQ - q[stateStr][bestMove]) / counts[stateStr][bestMove]
    return newQ
def rolloutVanillaMCTS(model, cube, depth):
    """Monte-Carlo rollout: sample moves from the policy head until the depth
    budget runs out; return the discounted sum of rewards collected."""
    if depth == 0:
        return 0
    encoded = np.array([py222.getState(cube).flatten()])
    _, policies = model.predict(encoded)
    chosenIndex = selectActionSoftmax(policies)
    successor = py222.doAlgStr(cube, moves[chosenIndex])
    immediate = reward(successor)
    return immediate + constants.kDiscountFactor * rolloutVanillaMCTS(
        model, successor, depth - 1)
def solveSingleCubeGreedy(model, cube, maxMoves):
    """Greedily follow the policy head's argmax move until the cube is solved
    or the move budget is exceeded.

    Returns (solved?, moves taken); failure reports maxMoves + 1 moves.
    """
    for movesTaken in range(maxMoves + 1):
        if py222.isSolved(cube, convert=True):
            return True, movesTaken
        encoded = np.array([py222.getState(cube).flatten()])
        _, policies = model.predict(encoded)
        greedyIndex = policies[0].argmax()
        cube = py222.doAlgStr(cube, moves[greedyIndex])
    return False, maxMoves + 1
def generateSamples(k, l):
    """Generate l scramble sequences of k random moves each.

    Returns the N = k * l scrambled cube samples together with their flattened
    state encodings packed into a sparse COO matrix.
    """
    N = k * l
    samples = np.empty((N, constants.kNumStickers), dtype=bytes)
    states = np.empty((N, constants.kNumCubes * constants.kNumStickers))
    for sequence in range(l):
        # Every sequence starts from the solved state and scrambles step by step.
        cube = py222.initState()
        for step in range(k):
            cube = py222.doAlgStr(cube, getRandomMove())
            index = k * sequence + step
            samples[index] = cube
            states[index] = py222.getState(cube).flatten()
    return samples, coo_matrix(states)
def solveSingleCubeVanillaMCTS(model, cube, maxMoves, maxDepth):
    """Solve a cube by repeatedly picking the best move via vanilla MCTS.

    Q values and visit counts persist across moves so later searches reuse
    earlier statistics.  Returns (solved?, moves taken); failure reports
    maxMoves + 1 moves.
    """
    q = {}
    counts = {}
    for movesTaken in range(maxMoves + 1):
        if py222.isSolved(cube, convert=True):
            return True, movesTaken
        chosenMove = selectActionVanillaMCTS(model, cube, maxDepth, q, counts)
        if chosenMove == -1:
            # Selection failed; abandon the attempt and report failure below.
            print("something went wrong when selecting best move")
            break
        cube = py222.doAlgStr(cube, moves[chosenMove])
    return False, maxMoves + 1
def doADI(k, l, M):
    """Autodidactic Iteration: run M training iterations, each fitting the
    model on k * l freshly scrambled cubes labelled by a one-step lookahead
    over the current value head.  Returns the trained model.

    The bare print() calls report per-phase wall-clock timings and the
    iteration number, matching the original instrumentation.
    """
    model = buildModel(constants.kNumStickers * constants.kNumCubes)
    compileModel(model, constants.kLearningRate)
    for iterNum in range(M):
        t0 = time.time()
        samples, _ = generateSamples(k, l)
        t1 = time.time()
        print(t1 - t0)

        numSamples = len(samples)
        states = np.empty(
            (numSamples, constants.kNumStickers * constants.kNumCubes))
        optimalVals = np.empty((numSamples, 1))
        optimalPolicies = np.empty(numSamples, dtype=np.int32)

        t0 = time.time()
        for i, sample in enumerate(samples):
            # One-step lookahead: score every child with the value head.
            values = np.empty(len(moves))
            for j, move in enumerate(moves):
                child = py222.doAlgStr(sample, move)
                childState = np.array([py222.getState(child).flatten()])
                value, _ = model.predict(childState)
                values[j] = value[0][0] + reward(child)
            # The best-scoring child defines both training targets.
            optimalVals[i] = np.array([values.max()])
            optimalPolicies[i] = values.argmax()
            states[i] = py222.getState(sample).flatten()
        t1 = time.time()
        print(t1 - t0)

        t0 = time.time()
        model.fit(states,
                  {"PolicyOutput": optimalPolicies,
                   "ValueOutput": optimalVals},
                  epochs=constants.kNumMaxEpochs,
                  verbose=False,
                  steps_per_epoch=1)
        t1 = time.time()
        print(t1 - t0)
        gc.collect()
        print(iterNum)
    return model
import py222
import solver
import numpy as np

# Demo: start from the solved state, apply a simple one-move scramble,
# then hand the scrambled cube to the IDA* solver.
cube = py222.initState()
cube = py222.doAlgStr(cube, "F2")
solver.solveCube(cube)
# solve a cube state
def solveCube(s):
    """Solve cube state *s* with iterative-deepening A* up to depth 11.

    Prints progress as it normalizes stickers, generates the pruning tables,
    and deepens the search one ply at a time.
    """
    py222.printCube(s)

    # FC-normalize stickers so the search operates on a canonical orientation.
    print("normalizing stickers...")
    s = py222.normFC(s)

    # Pruning tables are built fresh from the solved state.
    print("generating pruning tables...")
    genOTable(py222.initState(), 0)
    genPTable(py222.initState(), 0)

    # Iterative deepening: try successively larger depth bounds until solved.
    print("searching...")
    for depth in range(1, 12):
        print("depth {}".format(depth))
        if IDAStar(s, depth, []):
            break


if __name__ == "__main__":
    # Input some scrambled state and solve it.
    scrambled = py222.doAlgStr(py222.initState(), "R U2 R2 F2 R' F2 R F R")
    solveCube(scrambled)
def solveSingleCubeFullMCTS(model, cube, maxMoves):
    """AlphaZero-style MCTS (PUCT selection with virtual loss) over cube states.

    Alternates between expanding the current state (initializing per-move
    statistics for unseen children and backing the value-head estimate up the
    simulated path) and PUCT selection of the next move.  Returns
    (solved?, moves taken, simulated path); failure reports maxMoves + 1.
    """
    numMovesTaken = 0
    simulatedPath = []       # states visited so far, root included
    simulatedActions = []    # action taken out of each visited state
    treeStates = set()       # states already expanded into the tree
    seenStates = set()       # states whose per-move statistics exist
    counts = {}              # N(s, a): visit counts
    maxVals = {}             # W(s, a): best value backed up through (s, a)
    priorProbabilities = {}  # P(s, a): policy-head priors
    virtualLosses = {}       # pending virtual loss, discourages re-selection

    currentCube = cube
    currentCubeStr = str(cube)
    rootEncoding = np.array([py222.getState(currentCube).flatten()])
    _, rootProbs = model.predict(rootEncoding)
    initStateVals(currentCubeStr, counts, maxVals, priorProbabilities,
                  virtualLosses, rootProbs[0])
    seenStates.add(currentCubeStr)
    simulatedPath.append(currentCube)

    while numMovesTaken <= maxMoves:
        if py222.isSolved(currentCube, convert=True):
            return True, numMovesTaken, simulatedPath
        if currentCubeStr not in treeStates:
            # --- Expansion: initialize statistics for every unseen child.
            for move in moves:
                childCube = py222.doAlgStr(currentCube, move)
                childStr = str(childCube)
                if childStr not in seenStates:
                    childEncoding = np.array(
                        [py222.getState(childCube).flatten()])
                    _, childProbs = model.predict(childEncoding)
                    initStateVals(childStr, counts, maxVals,
                                  priorProbabilities, virtualLosses,
                                  childProbs[0])
                    seenStates.add(childStr)
            # --- Backup: push the leaf's value estimate along the simulated
            # path, reverting the virtual losses applied during selection.
            # (zip truncates to simulatedActions, which has one fewer entry
            # than simulatedPath.)
            leafEncoding = np.array([py222.getState(currentCube).flatten()])
            leafValue, _ = model.predict(leafEncoding)
            leafValue = leafValue[0][0]
            for pathCube, action in zip(simulatedPath, simulatedActions):
                pathStr = str(pathCube)
                maxVals[pathStr][action] = max(
                    maxVals[pathStr][action], leafValue)
                counts[pathStr][action] += 1
                virtualLosses[pathStr][action] -= constants.kVirtualLoss
            treeStates.add(currentCubeStr)
        else:
            # --- Selection: PUCT score Q + U, where Q is penalized by any
            # outstanding virtual loss.
            totalStateCounts = sum(
                counts[currentCubeStr][m] for m in moves)
            actionVals = np.zeros(len(moves))
            for i, currMove in enumerate(moves):
                qVal = (maxVals[currentCubeStr][currMove]
                        - virtualLosses[currentCubeStr][currMove])
                uVal = (constants.kMCTSExploration
                        * priorProbabilities[currentCubeStr][currMove]
                        * math.sqrt(totalStateCounts)
                        / (1 + counts[currentCubeStr][currMove]))
                actionVals[i] = uVal + qVal
            bestMove = moves[actionVals.argmax()]
            # Mark the chosen edge with a virtual loss until backup clears it.
            virtualLosses[currentCubeStr][bestMove] += constants.kVirtualLoss
            simulatedActions.append(bestMove)
            currentCube = py222.doAlgStr(currentCube, bestMove)
            currentCubeStr = str(currentCube)
            simulatedPath.append(currentCube)
            numMovesTaken += 1
    return False, maxMoves + 1, simulatedPath
def executeAction(self, action):
    """Apply the move string *action* to this object's cube, replacing
    self.cube with the resulting state."""
    self.cube = py222.doAlgStr(self.cube, action)