Code example #1
File: prioritizedRL.py Project: gchhor/DeepRL
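This example builds a small deterministic chain MDP with `totalState` states and two actions per state, and uses it to compare replay sampling strategies (uniform, max-priority, and rank-based prioritized sampling) against plain online Q-learning. Values are learned either with a linear function approximator (mode 'FA') or a tabular Q-table ('Tablar'). The code targets Python 2 and depends on a BinaryHeap class defined elsewhere in the project; a compatible sketch is given after the example.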
import random
import time

import numpy as np


# BinaryHeap is the project's priority-queue helper, defined elsewhere in the
# repository; a minimal compatible sketch is included after this example.
class Tester:
    def __init__(self):
        self.totalState = 10
        self.totalAction = 2
        self.epsilon = 0.1
        self.stepSize = 0.25
        #self.stepSize = 0.05
        self.discount = 1.0 - 1.0 / self.totalState
        self.maxIter = 10**6
        self.repeatNo = 10
        self.minibatch = 5
        self.alpha = 0.7
        self.beta = 0.5
        self.replayMemory = []
        self.maxWeight = 0

        self.mode = 'FA'
        #self.mode = 'Tablar'

        #self.samplePolicy = 'uniform'
        #self.samplePolicy = 'maxPriority'
        self.samplePolicy = 'rank'
        self.binaryHeap = BinaryHeap()

        print 'replay memory size : %s' % (2**(self.totalState + 1) - 2)

    def initialize(self):
        self.Qval = np.random.normal(
            0, 0.1, (self.totalState, self.totalAction)).astype(np.float32)
        self.params = np.random.normal(
            0, 0.1,
            (self.totalState * self.totalAction + 1)).astype(np.float32)
        self.wrongActions = np.zeros((self.totalState), dtype=np.int8)
        for i in range(self.totalAction):
            if i % 2 == 0:
                self.wrongActions[i] = 1
        self.replayMemory = []
        self.generateReplay()

    def generateReplay(self):
        for s in range(self.totalState - 1, -1, -1):
            repeat = 2**(self.totalState - s - 1)
            for r in range(repeat):
                a = self.getTrueAction(s)
                s2, r = self.doAction(s, a)
                self.replayMemory.append((s, a, r, s2))

                a = self.getWrongAction(s)
                s2, r = self.doAction(s, a)
                self.replayMemory.append((s, a, r, s2))
        random.shuffle(self.replayMemory)

        # Generate binary heap
        rank = 1
        rankSum = 0
        for data in self.replayMemory:
            self.binaryHeap.add(data, 1.0)
            rankSum += (1.0 / rank)**self.alpha
            rank += 1

        # Precompute segment boundaries for rank-based sampling: the total
        # probability mass, with P(rank) proportional to (1 / rank) ** alpha,
        # is split into `minibatch` roughly equal segments.
        self.rankIndex = []
        segment = rankSum / self.minibatch
        if self.samplePolicy == 'rank':
            rank = 1
            segmentRankSum = 0
            segmentIndex = 0
            for i in range(1, len(self.replayMemory)):
                segmentRankSum += (1.0 / rank)**self.alpha
                rank += 1
                if segmentRankSum >= segment:
                    self.rankIndex.append(i)
                    segmentIndex += 1
                    segmentRankSum = 0
            self.rankIndex.append(len(self.replayMemory) - 1)

    def getAction(self, s):
        if random.random() < self.epsilon:
            return random.randint(0, 1)
        else:
            return np.argmax(self.getQval(s))

    def getFeatures(self, s, a):
        features = np.zeros((self.totalState * self.totalAction + 1),
                            dtype=np.int)
        features[s * self.totalAction + a] = 1
        features[self.totalState * self.totalAction] = 1  # bias
        return features

    def getQval(self, s, a=None):
        if self.mode == 'FA':
            if a is None:
                values = []
                values.append(self.params.dot(self.getFeatures(s, 0)))
                values.append(self.params.dot(self.getFeatures(s, 1)))
                return values
            else:
                return self.params.dot(self.getFeatures(s, a))
        else:
            if a is None:
                return self.Qval[s]
            else:
                return self.Qval[s, a]

    def isWrongAction(self, s, a):
        if self.wrongActions[s] == a:
            return True
        else:
            return False

    def getTrueAction(self, s):
        return 1 - self.wrongActions[s]

    def getWrongAction(self, s):
        return self.wrongActions[s]

    def doAction(self, s, a):
        """ returns next state and reward """

        if self.isWrongAction(s, a):
            return 0, 0
        else:
            if s < self.totalState - 1:
                return s + 1, 0
            else:
                return 0, 1

    def updateValue(self, s, a, td):
        if self.mode == 'FA':
            self.params += self.stepSize * td * self.getFeatures(s, a)
        else:
            self.Qval[s, a] = self.getQval(s, a) + self.stepSize * td

        if self.samplePolicy == 'maxPriority':
            self.binaryHeap.reorderTop(np.abs(td))

    def isComplete(self):
        R = 1.0
        error = 0
        for s in range(self.totalState - 1, -1, -1):
            for a in range(self.totalAction):
                estimate = self.getQval(s, a)
                if self.isWrongAction(s, a):
                    groundTruth = 0
                else:
                    groundTruth = R
                error += np.square(groundTruth - estimate)
            R *= self.discount

        if error / (self.totalState * self.totalAction) <= 10**-3:
            return True
        else:
            return False

    def sampleReplay(self, segment):
        if self.samplePolicy == 'uniform':
            index = random.randint(0, len(self.replayMemory) - 1)
            return index, 1.0, self.replayMemory[index]
        elif self.samplePolicy == 'maxPriority':
            index = 1
            item = self.binaryHeap.getTop()
            return index, 1.0, item[0]
        elif self.samplePolicy == 'rank':
            if segment == 0:
                index1 = 1
            else:
                index1 = self.rankIndex[segment - 1] + 1
            index2 = self.rankIndex[segment]
            index = random.randint(index1, index2)
            item = self.binaryHeap.heap[index]
            # Importance-sampling correction for the non-uniform sampling,
            # normalized by the largest weight seen so far so that updates
            # are only ever scaled down.
            weight = (1.0 / index / self.totalState)**self.beta

            if weight > self.maxWeight:
                self.maxWeight = weight

            weight = weight / self.maxWeight
            # DJDJ
            #weight = 1.0
            return index, weight, item[0]

    def gogoReplay(self):
        print 'Training replay : policy %s' % self.samplePolicy

        startTime = time.time()
        trainDone = []
        paramSum = np.zeros((self.totalState * self.totalAction + 1))
        for repeat in range(self.repeatNo):
            self.initialize()

            for i in range(self.maxIter):
                paramSum.fill(0)
                for m in range(self.minibatch):
                    index, weight, (s, a, r, s2) = self.sampleReplay(m)
                    if s2 == 0:  # terminal state
                        td = r - self.getQval(s, a)
                    else:
                        td = r + self.discount * np.max(
                            self.getQval(s2)) - self.getQval(s, a)

                    if self.mode == 'FA':
                        paramSum += self.stepSize * weight * td * self.getFeatures(
                            s, a)
                    else:
                        self.Qval[s,
                                  a] = self.getQval(s, a) + self.stepSize * td

                    self.binaryHeap.reorder(index, np.abs(td))

                self.params += paramSum

                if i % 10 == 0:
                    if self.isComplete():
                        print 'training done %s out of %s' % (repeat + 1,
                                                              self.repeatNo)
                        trainDone.append(i)
                        break

        print '%s' % trainDone
        print '%s state training complete with %s mean iters = %.0f' % (
            self.totalState, self.mode, np.mean(trainDone))
        print 'elapsed : %.1fs' % (time.time() - startTime)

    def gogoOnline(self):
        print 'Training online'

        trainDone = []
        for repeat in range(self.repeatNo):
            s = 0
            self.initialize()

            for i in range(self.maxIter):
                a = self.getAction(s)
                s2, r = self.doAction(s, a)
                if s2 == 0:  # terminal state
                    td = r - self.getQval(s, a)
                else:
                    td = r + self.discount * np.max(
                        self.getQval(s2)) - self.getQval(s, a)
                self.updateValue(s, a, td)
                s = s2

                if i % 10 == 0:
                    if self.isComplete():
                        #print 'training done %s out of %s' % (repeat+1, self.repeatNo)
                        trainDone.append(i)
                        break

        print '%s' % trainDone
        print '%s state training complete with %s mean iters = %.0f' % (
            self.totalState, self.mode, np.mean(trainDone))
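The class above relies on a BinaryHeap that is not shown on this page. The following is a minimal sketch of a compatible max-priority heap, assuming the 1-based indexing and the [data, priority] item layout that the 'rank' and 'maxPriority' sampling paths use (heap[1] holds the highest-priority item, and getTop() / heap[index] return an item whose first element is the stored transition); the project's actual implementation may differ.

class BinaryHeap:
    """Max-heap keyed by priority; heap[1] always holds the largest priority.

    Items are stored as [data, priority] pairs in a 1-based list so that the
    array position can be read as an approximate rank by the rank-based sampler.
    """

    def __init__(self):
        self.heap = [None]  # index 0 is unused; items occupy positions 1..n

    def add(self, data, priority):
        self.heap.append([data, priority])
        self._siftUp(len(self.heap) - 1)

    def getTop(self):
        return self.heap[1]

    def reorderTop(self, priority):
        self.reorder(1, priority)

    def reorder(self, index, priority):
        # Assign a new priority to the item at `index`, then restore heap order.
        self.heap[index][1] = priority
        self._siftUp(index)
        self._siftDown(index)

    def _siftUp(self, i):
        while i > 1 and self.heap[i][1] > self.heap[i // 2][1]:
            self.heap[i], self.heap[i // 2] = self.heap[i // 2], self.heap[i]
            i //= 2

    def _siftDown(self, i):
        size = len(self.heap) - 1
        while True:
            left, right, largest = 2 * i, 2 * i + 1, i
            if left <= size and self.heap[left][1] > self.heap[largest][1]:
                largest = left
            if right <= size and self.heap[right][1] > self.heap[largest][1]:
                largest = right
            if largest == i:
                break
            self.heap[i], self.heap[largest] = self.heap[largest], self.heap[i]
            i = largest

With such a heap in scope, a hypothetical driver (not part of the original file) could exercise both training loops:

if __name__ == '__main__':
    tester = Tester()
    tester.gogoReplay()   # prioritized replay training
    tester.gogoOnline()   # plain online Q-learning for comparison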
Code example #2
File: prioritizedRL.py Project: only4hj/DeepRL
The code in this project is identical to example #1 above.