Code Example #1
import matplotlib.pyplot as plt # used for the learning-progress plot in learn()
from actor import Actor # assumed module path for the Actor class (not shown here)
class Agent:
    def __init__(self, env, alphaActor, alphaCritic, lam, gamma, criticType, hiddenLayerSizes):
        self.__env = env
        self.__actor = Actor(alphaActor, lam, gamma)
        self.__criticType = criticType
        if self.__criticType == 0: # use criticTable
            from criticTable import CriticTable
            self.__critic = CriticTable(alphaCritic, lam, gamma)
        else: # use criticNN
            from criticNN import CriticNN
            state = env.getState()
            inputLayerSize = len(state)
            self.__critic = CriticNN(alphaCritic, lam, gamma, hiddenLayerSizes, inputLayerSize)

    # Actor-Critic learning
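    # Each run is one episode: eligibilities are reset, the episode is played out with
    # eps-greedy action selection, and after every move the critic's TD error updates both
    # the critic's state values and the actor's state-action pair (SAP) values; eps is
    # multiplied by epsDecay after each episode to gradually reduce exploration.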
    def learn(self, runs, eps, epsDecay, verbose=False):
        pegsLeft = []
        iterationNumber = []
        if not verbose: # show a progress bar instead of per-episode output
            from tqdm import tqdm
            runList = tqdm(range(runs))
        else:
            runList = range(runs)
        for i in runList: # for each episode
            self.__actor.resetEligibilities()
            self.__critic.resetEligibilities()
            state, validActions = self.__env.reset()
            if self.__criticType == 0: # only needed for table critic
                self.__critic.createEligibility(state)
                self.__critic.createStateValues(state)
            self.__actor.createSAPs(state, validActions)
            if len(validActions) == 0: break # stop if the initial state offers no valid moves
            action = self.__actor.findNextAction(state, validActions, eps)
            self.__actor.updateEligibility(state, action)
            while len(validActions) > 0: # while a valid next move exists
                lastState, state, reinforcement, validActions = self.__env.execute(action)
                if self.__criticType == 0:
                    self.__critic.createEligibility(state)
                    self.__critic.createStateValues(state)
                self.__actor.createSAPs(state, validActions)
                action = self.__actor.findNextAction(state, validActions, eps)
                self.__actor.updateEligibility(state, action)
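                # the critic evaluates the last transition; its TD error then drives both the
                # critic's value update (table or network) and the actor's SAP update below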
                td_error = self.__critic.findTDError(reinforcement, lastState, state)
                if self.__criticType == 0:
                    self.__critic.updateStateValues()
                else:
                    self.__critic.fit(reinforcement, lastState, state, td_error)
                self.__critic.updateEligibilities()
                self.__actor.updateSAPs(td_error)
                self.__actor.decayEligibilities()
            if verbose: # print a per-episode summary (pegs left, last-state value, current eps)
                print("ep", i,"  Pegs", self.__env.numberOfPegsLeft(), " LastState Value", "%.3f" % self.__critic.stateValue(lastState), " eps", "%.3f" % eps)
            pegsLeft.append(self.__env.numberOfPegsLeft())
            iterationNumber.append(i)
            eps = eps * epsDecay # decrease exploration
        plt.plot(iterationNumber, pegsLeft) # plot pegs left per episode to show learning progress
        plt.show()

    # runs one greedy episode, always choosing the best-known action (eps = 0)
    def runGreedy(self, animation_delay):
        state, validActions = self.__env.reset()
        self.__env.draw()
        action = self.__actor.findNextAction(state, validActions, 0)
        while len(validActions) > 0: # while a valid next move exists
            self.__env.draw(animation_delay)
            _, state, _, validActions = self.__env.execute(action)
            self.__actor.createSAPs(state, validActions) # if the game is not won, the greedy run may reach states unseen during training
            action = self.__actor.findNextAction(state, validActions, 0)
        self.__env.draw()
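
A minimal usage sketch of the class above: it assumes the Agent lives in an agent module, that a compatible environment (exposing reset, execute, getState, draw and numberOfPegsLeft) can be imported as Environment from an env module, and that the hyperparameter values are purely illustrative; none of these module paths, names or numbers come from the project itself.

# usage sketch -- module paths, class names and hyperparameters are assumptions
from env import Environment  # assumed environment module/class
from agent import Agent      # assumed module containing the Agent class above

env = Environment()
agent = Agent(env,
              alphaActor=0.1,        # actor learning rate
              alphaCritic=0.1,       # critic learning rate
              lam=0.9,               # eligibility-trace decay
              gamma=0.95,            # discount factor
              criticType=0,          # 0 = table critic, otherwise neural-network critic
              hiddenLayerSizes=[20, 10])  # only used by the NN critic

agent.learn(runs=500, eps=1.0, epsDecay=0.99)  # train and plot pegs left per episode
agent.runGreedy(animation_delay=0.5)           # animate one greedy episode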