    def update(self, transitionsBatch):
        """
        Update the Q-Values from the given batch of transitions
        :param transitionsBatch: List of tuples (qState, action, nextQState, reward, isStateFinal, list of legal actions)
        """

        trainingBatchQStates = []
        trainingBatchTargetQValues = []

        # Convert raw states to our q-states and calculate update policy for each transition in batch
        for aQState, anAction, aReward, aNextQState, isTerminal, nextStateLegalActions in transitionsBatch:

            # aReward = util.rescale(aReward, -510, 1000, -1, 1)

            # Current Q-value predictions for this state; only the entry for the
            # action actually taken is overwritten below
            actionsQValues = self.model.model.predict(np.array([aQState]))[0]
            targetQValues = actionsQValues.copy()

            # Q-learning update rule
            if isTerminal:
                # Terminal state: no future reward, the target is the immediate reward
                updatedQValueForAction = aReward

            else:
                nextActionsQValues = self.model.model.predict(
                    np.array([aNextQState]))[0]
                nextStateLegalActionsIndices = [
                    Directions.getIndex(action)
                    for action in nextStateLegalActions
                ]

                # Exclude STOP (index 4) from the next-state maximum
                try:
                    nextStateLegalActionsIndices.remove(4)
                except ValueError:
                    pass

                nextStateLegalActionsQValues = np.array(
                    nextActionsQValues)[nextStateLegalActionsIndices]
                maxNextActionQValue = max(nextStateLegalActionsQValues)

                # Bellman target: immediate reward plus discounted best next-state Q-value
                updatedQValueForAction = (
                    aReward + self.trainingRoom.discount * maxNextActionQValue)

            targetQValues[Directions.getIndex(
                anAction)] = updatedQValueForAction

            trainingBatchQStates.append(aQState)
            trainingBatchTargetQValues.append(targetQValues)

        return self.model.model.train_on_batch(
            x=np.array(trainingBatchQStates),
            y=np.array(trainingBatchTargetQValues))

    def getAction(self, rawState, epsilon):
        """
        Epsilon-greedy action selection: with probability epsilon return a random
        legal action, otherwise return the legal action with the highest Q-value.
        """
        legalActions = rawState.getLegalActions()
        legalActions.remove(Directions.STOP)  # the agent never chooses STOP

        # Explore with probability epsilon
        if util.flipCoin(epsilon):
            return random.choice(legalActions)

        else:
            # Exploit: rank legal actions by their Q-value, best first
            qValues = [(Directions.getIndex(action),
                        self.getQValue(rawState, action))
                       for action in legalActions]
            qValues = sorted(qValues, key=lambda x: x[1], reverse=True)

            for index, qValue in qValues:
                action = Directions.fromIndex(index)
                if action in legalActions:
                    return action
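
A rough idea of how these two methods could be wired together in a training loop follows. This is a minimal sketch, not part of the project above: the environment interface (reset, isOver, step), the buildQState helper, and the batch sampling are all assumptions.

import random


def runTrainingEpisode(agent, environment, epsilon, batchSize=32):
    # Hypothetical driver: collect one episode of transitions, then train
    # on a random batch of them with update() above
    transitions = []
    rawState = environment.reset()        # assumed environment API
    while not environment.isOver():       # assumed environment API
        action = agent.getAction(rawState, epsilon)
        nextRawState, reward, isTerminal = environment.step(action)  # assumed

        # Store the successor's legal actions so update() can restrict its
        # max over next-state Q-values, as the code above expects
        transitions.append((agent.buildQState(rawState), action, reward,
                            agent.buildQState(nextRawState), isTerminal,
                            nextRawState.getLegalActions()))
        rawState = nextRawState

    batch = random.sample(transitions, min(batchSize, len(transitions)))
    return agent.update(batch)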
Example #3
def getGhostDirections(state):
    # Each ghost's heading encoded as its direction index, scaled to roughly [0, 1]
    return np.array([Directions.getIndex(s.getDirection())
                     for s in state.getGhostStates()]) / 4.0
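
One plausible use of such a helper is to fold the normalized ghost headings into a larger observation vector. This is only a sketch: buildFeatureVector and the particular positional features chosen here are illustrative, not part of the original example.

import numpy as np


def buildFeatureVector(state):
    # Concatenate the normalized ghost headings with a few positional
    # features from the Pacman GameState API (illustrative selection)
    ghostDirections = getGhostDirections(state)
    ghostPositions = np.array(state.getGhostPositions(), dtype=float).flatten()
    pacmanPosition = np.array(state.getPacmanPosition(), dtype=float)
    return np.concatenate([ghostDirections, ghostPositions, pacmanPosition])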
Example #4
def remember(self, state, action, reward, nextState):
    from game import Directions
    # Key each transition by (state hash, action index), so revisiting the
    # same state-action pair overwrites the stored transition
    self.replayMemory[str(state.__hash__()) +
                      str(Directions.getIndex(action))] = (state, action,
                                                           reward,
                                                           nextState)
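
Since replayMemory is an ordinary dict keyed by state hash plus action index, drawing a random minibatch from it for training might look like the following sketch. The method name and the batch size are assumptions, not part of the example above.

import random


def sampleReplayMemory(self, batchSize=32):
    # Sample a minibatch of (state, action, reward, nextState) tuples from
    # the dictionary filled by remember()
    transitions = list(self.replayMemory.values())
    return random.sample(transitions, min(batchSize, len(transitions)))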