# Excerpt from the training loop: the first block runs once per environment step
# (inside the per-episode loop), the second block runs once per finished episode.

        # Add Gaussian exploration noise to the action and keep it within bounds.
        a = np.clip(np.random.normal(a, 0.3), -A_BOUND, A_BOUND)
        s_, r, done, _ = env.step(a)
        end = 0 if done else 1  # continuation flag: 0 masks the bootstrap target for terminal states
        scaled_r = r / 10
        memory.save(s, a, scaled_r, s_, end)

        # Log TensorBoard summaries every 10 writes.
        if write_pointer % 10 == 0:
            summary = sess.run(merged)
            writer.add_summary(summary, write_pointer)

        # Start training once the buffer holds enough transitions.
        if memory.get_size() > HORIZON:
            bs, ba, br, bs_, be = memory.sample(BATCH_SIZE)
            agent.train(bs, ba, br, bs_, be)

        s = s_
        ep_reward += r
        step_pointer += 1
        if done:
            break

    # Per-episode bookkeeping: running average over the last 100 episodes.
    recent100[episode % 100] = ep_reward
    average_reward = recent100.mean()
    print('EP: %i ' % episode,
          'R: %.2f ' % ep_reward,
          'Avg: %.2f ' % average_reward,
          'STEP: %i ' % step_pointer)
    if average_reward >= TERMINATE_REWARD and episode > 100:
        break  # assumed: stop training once the task counts as solved (the body was truncated in the source)
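# The loop above assumes a replay buffer exposing save(), get_size() and sample().
# The class below is a minimal sketch of such a ring buffer, written only to match
# those call sites; the name ReplayMemory, the capacity parameter and the flat row
# layout are assumptions for illustration, not the project's original implementation.

import numpy as np


class ReplayMemory:
    """Fixed-size ring buffer over (s, a, r, s_, end) transitions (illustrative sketch)."""

    def __init__(self, capacity, state_dim, action_dim):
        self.capacity = capacity
        self.state_dim = state_dim
        self.action_dim = action_dim
        # Each row stores one transition flattened to [s | a | r | s_ | end].
        self.data = np.zeros((capacity, 2 * state_dim + action_dim + 2), dtype=np.float32)
        self.pointer = 0  # next slot to write; wraps around once the buffer is full
        self.size = 0     # number of valid rows

    def save(self, s, a, r, s_, end):
        self.data[self.pointer % self.capacity] = np.hstack((s, a, [r], s_, [end]))
        self.pointer += 1
        self.size = min(self.size + 1, self.capacity)

    def get_size(self):
        return self.size

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        batch = self.data[idx]
        s_dim, a_dim = self.state_dim, self.action_dim
        bs = batch[:, :s_dim]
        ba = batch[:, s_dim:s_dim + a_dim]
        br = batch[:, s_dim + a_dim:s_dim + a_dim + 1]
        bs_ = batch[:, s_dim + a_dim + 1:-1]
        be = batch[:, -1:]
        return bs, ba, br, bs_, be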
import numpy as np


class Agent:
    # RL variables.
    # Actions and states are defined as tuples (not as lists) so they can be used as dict keys:
    UP = (0, -1)
    DOWN = (0, 1)
    LEFT = (-1, 0)
    RIGHT = (1, 0)
    ACTIONSPACE = [UP, DOWN, LEFT, RIGHT]  # for iteration purposes

    # Flow control variables to pass to an external GUI:
    UPDATED_BY_PLANNING = "Planning Update"
    UPDATED_BY_EXPERIENCE = "Experience Update"
    TOOK_ACTION = "Action Taken"
    FINISHED_EPISODE = "Episode Finish"
    STARTED_EPISODE = "Episode Start"
    OPERATIONS = [UPDATED_BY_PLANNING, UPDATED_BY_EXPERIENCE, TOOK_ACTION,
                  FINISHED_EPISODE, STARTED_EPISODE]  # for iteration purposes

    def __init__(self, environment, learningRateVar, dynamicAlphaVar, discountVar,
                 nStepVar, nPlanVar, onPolicyVar, updateByExpectationVar,
                 behaviorEpsilonVar, behaviorEpsilonDecayRateVar,
                 targetEpsilonVar, targetEpsilonDecayRateVar,
                 initialActionvalueMean=0, initialActionvalueSigma=0,
                 predefinedAlgorithm=None, actionPlan=None):
        self.environment = environment
        if predefinedAlgorithm:
            # TODO: set missing params accordingly
            pass
        self.learningRateVar = learningRateVar
        self.dynamicAlphaVar = dynamicAlphaVar
        self.discountVar = discountVar
        self.behaviorPolicy = EpsilonGreedyPolicy(self, behaviorEpsilonVar, behaviorEpsilonDecayRateVar)
        self.targetPolicy = EpsilonGreedyPolicy(self, targetEpsilonVar, targetEpsilonDecayRateVar)
        self.onPolicyVar = onPolicyVar
        self.updateByExpectationVar = updateByExpectationVar
        self.nStepVar = nStepVar
        self.nPlanVar = nPlanVar
        self.initialActionvalueMean = initialActionvalueMean  # TODO: set this in the GUI
        self.initialActionvalueSigma = initialActionvalueSigma  # TODO: set this in the GUI
        # Strictly speaking, the agent has no model at all and therefore initially knows
        # nothing about the environment, including its shape. But to avoid implementation
        # details that would not change the agent's behavior in any way, the agent is given
        # that the states can be arranged in a matrix of the same shape as the environment
        # and that the action space is the same for every state.
        self.Qvalues = np.empty_like(self.environment.get_grid())
        self.greedyActions = np.empty_like(self.environment.get_grid())
        self.initialize_Qvalues()
        self.stateActionPairCounts = np.empty_like(self.environment.get_grid())
        self.initialize_stateActionPairCounts()
        self.episodicTask = None  # TODO: not used so far
        self.state = None
        self.episodeFinished = False
        self.return_ = None  # trailing underscore avoids a naming conflict with the "return" keyword
        self.episodeReturns = []
        self.memory = Memory(self)
        self.hasChosenExploratoryMove = None
        self.hasMadeExploratoryMove = None
        self.targetAction = None
        self.targetActionvalue = None
        self.iSuccessivePlannings = None
        # Debug variables:
        self.actionPlan = actionPlan if actionPlan is not None else []  # None default avoids a shared mutable default argument
        self.actionHistory = []

    def initialize_Qvalues(self):
        for x in range(self.Qvalues.shape[0]):
            for y in range(self.Qvalues.shape[1]):
                self.Qvalues[x, y] = {action: np.random.normal(self.initialActionvalueMean,
                                                               self.initialActionvalueSigma)
                                      for action in self.ACTIONSPACE}
                self.update_greedy_actions((x, y))

    def update_greedy_actions(self, state):
        maxActionValue = max(self.Qvalues[state].values())
        self.greedyActions[state] = [action for action, value in self.Qvalues[state].items()
                                     if value == maxActionValue]

    def set_Q(self, S, A, value):
        self.Qvalues[S][A] = value
        self.update_greedy_actions(state=S)

    def initialize_stateActionPairCounts(self):
        for x in range(self.stateActionPairCounts.shape[0]):
            for y in range(self.stateActionPairCounts.shape[1]):
                self.stateActionPairCounts[x, y] = {action: 0 for action in self.ACTIONSPACE}

    def operate(self):
        if (self.get_memory_size() >= self.nStepVar.get() >= 1
                or (self.episodeFinished and self.get_memory_size())):
            # The first condition is never True for MC.
            self.process_earliest_memory()
            return self.UPDATED_BY_EXPERIENCE
        elif self.episodeFinished:
            self.episodeReturns.append(self.return_)
            self.hasMadeExploratoryMove = False  # so at the next start the agent isn't colored as exploratory anymore
            self.state = self.environment.remove_agent()
            self.episodeFinished = False
            return self.FINISHED_EPISODE
        elif self.state is None:
            self.start_episode()
            return self.STARTED_EPISODE
        elif self.iSuccessivePlannings < self.nPlanVar.get():
            # TODO: A model-based algorithm needs no memory and does not need to pass a target
            # action to the behavior action. Nevertheless, an expected version is possible.
            self.plan()
            self.iSuccessivePlannings += 1
            return self.UPDATED_BY_PLANNING
        else:
            self.take_action()
            return self.TOOK_ACTION

    def start_episode(self):
        self.targetAction = None
        self.return_ = 0
        self.iSuccessivePlannings = 0
        self.state = self.environment.give_initial_position()
        if self.state is None:
            raise RuntimeError("No starting point found")

    def take_action(self):
        self.iSuccessivePlannings = 0
        behaviorAction = self.generate_behavior_action()
        # This is the only place where the agent exchanges information with the environment.
        reward, successorState, self.episodeFinished = self.environment.apply_action(behaviorAction)
        # If hasChosenExploratoryMove were the only indicator for changing the agent color in
        # the next visualization, then in the on-policy case, if the target was chosen to be
        # an exploratory move in the last step call, the coloring would happen BEFORE the move
        # was taken: at this line the behavior action would already be determined and simply
        # copied from that target action, with no way to track whether it was exploratory.
        self.hasMadeExploratoryMove = self.hasChosenExploratoryMove
        self.memory.memorize(self.state, behaviorAction, reward)
        self.return_ += reward  # trailing underscore because "return" is a Python keyword
        self.state = successorState  # must happen after memorize and before generate_target!
        self.generate_target()
        self.behaviorPolicy.decay_epsilon()
        self.targetPolicy.decay_epsilon()
        # self.actionHistory.append(behaviorAction)  # TODO: don't forget debug stuff here
        # print(self.actionHistory)

    def generate_behavior_action(self):
        if self.onPolicyVar.get() and self.targetAction:
            # In this case the target action was chosen beforehand by the behavior policy
            # (which is the only policy in the on-policy case).
            return self.targetAction
        else:
            # This branch is taken if one of the following applies:
            # ...the updates are off-policy, so the behavior action is NOT copied from a
            #    previously chosen target action;
            # ...there is no recent target action, because the value used for the latest
            #    update was an expectation OR no update has happened in this episode so far.
            return self.behaviorPolicy.generate_action(self.state)

    def generate_target(self):
        if self.episodeFinished:
            self.targetAction = None
            self.targetActionvalue = 0  # by definition
            return
        if self.onPolicyVar.get():
            policy = self.behaviorPolicy
        else:
            policy = self.targetPolicy
        if self.updateByExpectationVar.get():
            # The target action must be cleared here: otherwise, if switched dynamically to
            # expectation during an episode, in the on-policy case the action selected in the
            # else-block below would be copied and used as the behavior action in every
            # following turn, leaving an agent that can no longer change its direction.
            self.targetAction = None
            self.targetActionvalue = policy.get_expected_actionvalue(self.state)
        else:
            self.targetAction = policy.generate_action(self.state)
            self.targetActionvalue = self.get_Q(S=self.state, A=self.targetAction)

    def process_earliest_memory(self):
        self.update_actionvalue()
        self.memory.forget_oldest_memory()

    def update_actionvalue(self):
        # Step by step, so you can watch exactly what is happening in a debugger.
        discountedRewardSum = self.memory.get_discountedRewardSum()
        correspondingState, actionToUpdate, _ = self.memory.get_oldest_memory()
        Qbefore = self.get_Q(S=correspondingState, A=actionToUpdate)
        # In the MC case (n is -1 here) the targetActionvalue is zero anyway,
        # so it does not matter what n is.
        discountedTargetActionValue = cached_power(self.discountVar.get(),
                                                   self.nStepVar.get()) * self.targetActionvalue
        returnEstimate = discountedRewardSum + discountedTargetActionValue
        TD_error = returnEstimate - Qbefore
        if self.dynamicAlphaVar.get():
            self.stateActionPairCounts[correspondingState][actionToUpdate] += 1
            self.learningRateVar.set(1 / self.stateActionPairCounts[correspondingState][actionToUpdate])
        update = self.learningRateVar.get() * TD_error
        Qafter = Qbefore + update
        self.set_Q(S=correspondingState, A=actionToUpdate, value=Qafter)

    def plan(self):
        print(self.iSuccessivePlannings)

    def get_discount(self):
        return self.discountVar.get()

    def get_episodeReturns(self):
        return self.episodeReturns

    def get_state(self):
        return self.state

    def get_Qvalues(self):
        return self.Qvalues

    def get_greedyActions(self):
        return self.greedyActions

    def get_Q(self, S, A):
        return self.Qvalues[S][A]

    def get_targetAction(self):
        return self.targetAction

    def get_memory_size(self):
        return self.memory.get_size()
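# ---------------------------------------------------------------------------
# The Agent above relies on three collaborators defined elsewhere in the
# project: EpsilonGreedyPolicy, Memory and cached_power. The sketches below
# are minimal, ASSUMED implementations written only to match the call sites
# used above (generate_action, get_expected_actionvalue, decay_epsilon;
# memorize, get_oldest_memory, forget_oldest_memory, get_size,
# get_discountedRewardSum). They illustrate the expected interfaces and are
# not the project's original code.
# ---------------------------------------------------------------------------
import random
from functools import lru_cache


@lru_cache(maxsize=None)
def cached_power(base, exponent):
    # Memoized power: gamma ** n is recomputed with identical arguments on
    # every update, so caching avoids redundant work.
    return base ** exponent


class EpsilonGreedyPolicy:
    def __init__(self, agent, epsilonVar, epsilonDecayRateVar):
        self.agent = agent
        self.epsilonVar = epsilonVar
        self.epsilonDecayRateVar = epsilonDecayRateVar

    def generate_action(self, state):
        # With probability epsilon pick a uniformly random action, otherwise
        # pick (randomly among) the greedy action(s) for this state.
        if random.random() < self.epsilonVar.get():
            self.agent.hasChosenExploratoryMove = True
            return random.choice(self.agent.ACTIONSPACE)
        self.agent.hasChosenExploratoryMove = False
        return random.choice(self.agent.get_greedyActions()[state])

    def get_expected_actionvalue(self, state):
        # Expectation over the epsilon-greedy distribution: epsilon is spread
        # uniformly over all actions, the remaining mass over the greedy set.
        epsilon = self.epsilonVar.get()
        Q = self.agent.get_Qvalues()[state]
        greedy = self.agent.get_greedyActions()[state]
        exploratoryPart = epsilon / len(Q) * sum(Q.values())
        greedyPart = (1 - epsilon) / len(greedy) * sum(Q[a] for a in greedy)
        return exploratoryPart + greedyPart

    def decay_epsilon(self):
        # Assumed multiplicative decay schedule.
        self.epsilonVar.set(self.epsilonVar.get() * self.epsilonDecayRateVar.get())


class Memory:
    # Sliding window over the most recent (state, action, reward) triples for
    # n-step updates; rewards are discounted relative to the oldest entry.
    def __init__(self, agent):
        self.agent = agent
        self.entries = []  # oldest first

    def memorize(self, state, action, reward):
        self.entries.append((state, action, reward))

    def get_oldest_memory(self):
        return self.entries[0]

    def forget_oldest_memory(self):
        self.entries.pop(0)

    def get_size(self):
        return len(self.entries)

    def get_discountedRewardSum(self):
        gamma = self.agent.get_discount()
        return sum(cached_power(gamma, i) * reward
                   for i, (_, _, reward) in enumerate(self.entries))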