def train_and_save_SAR_MDP(track_str, starts, gamma):
    """Train a SARSA MDP on the named track and pickle the trained model.

    track_str -- base name of the track file (``track_str + '.txt'`` is loaded)
    starts    -- start states forwarded to ``SARSA.sarsa``
    gamma     -- discount factor for SARSA training
    """
    import os  # local import: keeps this fix self-contained

    test_track = Track(track_str + '.txt')
    test_mdp = MDP(test_track, algorithm='SARSA')
    SARSA.sarsa(test_mdp, starts, gamma=gamma)
    # BUG FIX: the original used a hard-coded backslash ('pickles\...')
    # inside the f-string, which on POSIX systems creates a file literally
    # named 'pickles\<...>' in the CWD instead of writing into pickles/.
    pickle_path = os.path.join('pickles', f'{track_str}_{gamma}_SAR_pickle')
    with open(pickle_path, 'wb') as file:
        pickle.dump(test_mdp, file)
    print('Training Complete')
def __init__(self, alpha, epsilon, gamma, actionList, size):
    """Set up the display geometry and one SARSA learner per feature channel.

    alpha, epsilon, gamma -- SARSA learning rate, exploration rate, discount
    actionList            -- the shared discrete action set
    size                  -- (width, height) pair of the world/display
    """
    self.size = size
    self.width, self.height = size
    self.actionList = actionList
    self.epsilon = epsilon
    # Four independent learners, one each for: all, world, turtle, coin.
    self.agent = [
        SARSA.SARSA(alpha, epsilon, gamma, actionList) for _ in range(4)
    ]
class LSPI_SARSA(Agent):
    """Hybrid agent: learns online with SARSA while feeding every sample to
    an LSPI buffer and periodically running an LSPI representation-expansion
    sweep."""

    def __init__(self, representation, policy, domain, logger,
                 lspi_iterations=5, sample_window=100, epsilon=1e-3,
                 re_iterations=100, initial_alpha=.1, lambda_=0,
                 alpha_decay_mode='dabney', boyan_N0=1000):
        """Build the wrapped SARSA and LSPI learners, then init the base Agent.

        lspi_iterations / sample_window / epsilon / re_iterations -- LSPI knobs
        initial_alpha / lambda_ / alpha_decay_mode / boyan_N0     -- SARSA knobs
        """
        self.SARSA = SARSA(representation, policy, domain, logger,
                           initial_alpha, lambda_, alpha_decay_mode, boyan_N0)
        self.LSPI = LSPI(representation, policy, domain, logger,
                         lspi_iterations, sample_window, epsilon,
                         re_iterations)
        super(LSPI_SARSA, self).__init__(representation, policy, domain,
                                         logger)

    def learn(self, s, a, r, ns, na, terminal):
        """Record the transition with LSPI; every ``steps_between_LSPI``
        samples run an LSPI expansion sweep; otherwise do a SARSA update
        (skipped on terminal steps, which instead end the episode)."""
        self.LSPI.process(s, a, r, ns, na, terminal)
        # BUG FIX: '%' binds tighter than '+', so the original
        # 'samples_count+1 % steps_between_LSPI == 0' parsed as
        # 'samples_count + (1 % k) == 0' and the LSPI sweep never ran.
        if (self.LSPI.samples_count + 1) % self.LSPI.steps_between_LSPI == 0:
            self.LSPI.representationExpansionLSPI()
        if terminal:
            self.episodeTerminated()
        else:
            self.SARSA.learn(s, a, r, ns, na, terminal)
def read_argument():
    """Parse the CLI reward/learning parameters and build the environment
    plus a SARSA learner from them.

    Returns (env, sarsa).
    """
    parser = argparse.ArgumentParser('Reinforcement Learning')
    # (name, type, help) for each positional argument, in CLI order.
    arg_specs = (
        ('goal_state_reward', float, 'The reward for reaching the goal state'),
        ('pit_fall_reward', float, 'The reward for falling into a pit'),
        ('move_reward', float, 'The reward for moving'),
        ('give_up_reward', float, 'The reward for giving up'),
        ('number_of_trials', int, 'The number of learning trials to run'),
        ('exploration_epsilon', float, 'The weight for exploration'),
    )
    for arg_name, arg_type, arg_help in arg_specs:
        parser.add_argument(arg_name, type=arg_type, help=arg_help)
    args = vars(parser.parse_args())

    env = environment.Environment(
        args['goal_state_reward'],
        args['pit_fall_reward'],
        args['move_reward'],
        args['give_up_reward'])
    sarsa = SARSA.SARSA(
        env,
        args['number_of_trials'],
        args['exploration_epsilon']
    )
    return env, sarsa
# Smoke test driver (Python 2): builds an RMax controller from a HORDQ
# learner and a SARSA probability estimator, then checks getRoom().
import EmptySARSA
import HORDQ
import SARSA

if __name__ == "__main__":
    # Learning hyper-parameters shared by both learners.
    alpha = 0.2
    epsilon = 0.1
    gamma = 0.9
    #ob = (-1, -1, -1, (1, 2))
    #ob2 = (-1, -1, -1, (5, 4))
    #ob3 = (-1, -1, -1, (0, 0))
    # Observation tuple; the last element looks like an (x, y) grid position.
    ob4 = (-1, -1, -1, (1, 1))
    punishment = 10
    isRORDQ = False
    hordQ = HORDQ.HORDQ(alpha, epsilon, gamma, [1, -1], isRORDQ)
    probQ = SARSA.SARSA(alpha, epsilon, gamma, [1, -1])
    # NOTE(review): RMax is not imported in this chunk -- presumably brought
    # into scope elsewhere in the file; verify before running standalone.
    controller = RMax(epsilon, gamma, hordQ, probQ, punishment)
    #unit test for get room
    val = controller.getRoom(ob4)
    print "value: ", val
    # Expect position (1, 1) to map to room 0.
    assert( val == 0)
    #controller.start(ob)
    #for i in range(0, 1000):
    #
    #controller.step(1, ob)
    #controller.step(1, ob)
    #controller.step(1, ob)
    #controller.step(1, ob2)
    #controller.end(10)
def BusRun(type, punishment, maxStep, isRORDQ, isRandomPlanner, isShow, framRate, loadFile): discrete_size = 6 objSet = (1, 1) monsterMoveProb = 0.3 isEpisodeEnd = False #maxStep = 200000 size = 800, 800 gridSize = (discrete_size, discrete_size) delay = 100 interval = 50 pygame.init() pygame.key.set_repeat(delay, interval) clock = pygame.time.Clock() screen = pygame.display.set_mode(size) actionList = ((0, 1), (0, -1), (1, 0), (-1, 0)) #controller = RelationalQ.RelationalQ(0.05, 0.1, 0.9, actionList) alpha = 0.2 probAlpha = 0.1 gamma = 1 if type == 'SARSA': epsilon = 0.1 controller = SARSA.SARSA(alpha, epsilon, gamma, actionList) else: epsilon = 0.05 #isRORDQ = True hordQ = HORDQ.HORDQ(alpha, epsilon, gamma, actionList, isRORDQ) probQ = SARSA.SARSA(probAlpha, epsilon, gamma, [0]) if isRandomPlanner: epsilon = 1 controller = RMax.RMax(epsilon, gamma, hordQ, probQ, punishment) if loadFile != '': print "load:", loadFile controller = tool.Load(loadFile) env = BusEnv((discrete_size, discrete_size), size, actionList) numOfTurtle = objSet[0] numOfCoin = objSet[1] print "# coin ", numOfCoin print "# Turtle ", numOfTurtle print "isEpisodeEnd ", isEpisodeEnd isTraining = not isEpisodeEnd count = 0 totalReward = 0 rewardList = [] stepCount = 0 while stepCount < maxStep: #randomly choose a sub goal at the beginning of the episode goalDiff = actionList[int(random.random() * len(actionList))] world = env.start(numOfTurtle, numOfCoin) action = controller.start(env.getSarsaFeature()) count += 1 prevStepCount = stepCount episodeReward = 0 while stepCount < maxStep: if stepCount % 1000 == 0: print "Time: ", stepCount / 1000 stepCount = stepCount + 1 clock.tick(frameRate) reward, flag = env.step(action) fea = env.getSarsaFeature() totalReward = totalReward + reward episodeReward = episodeReward + reward if flag: controller.end(reward) break action = controller.step(reward, fea) for event in pygame.event.get(): #action = 0 if event.type == pygame.QUIT: sys.exit() if isShow: 
screen.blit(env.getScreen(), (0, 0)) pygame.display.flip() rewardList.append((prevStepCount, stepCount, episodeReward)) print totalReward #for conf in controller.agent: #print controller.agent[conf].Q #controller.dumpObj() #controller.dumpCoinAndGoal() #controller.dumpCoinAndGoalEx(controller.prob) #controller.dumpCoinAndGoalEx(controller.realReward) tool.Save(controller, type) tool.Save(rewardList, 'reward_' + type)
import matplotlib.pyplot as plt
import numpy as np
import SARSA


def moving_average(a, n=3):
    """Simple moving average of *a* over a sliding window of *n* samples."""
    totals = np.cumsum(a, dtype=float)
    totals[n:] -= totals[:-n]
    return totals[n - 1:] / n


# Run 50k SARSA steps and plot the reward curve, smoothed over 5000 samples.
S_rewards = SARSA.run(50000)
S_plot = moving_average(S_rewards, 5000)  # smooth plot with a moving average
plt.plot(S_plot)
plt.show()
def __init__(self, representation, policy, domain, logger,
             lspi_iterations=5, sample_window=100, epsilon=1e-3,
             re_iterations=100, initial_alpha=.1, lambda_=0,
             alpha_decay_mode='dabney', boyan_N0=1000):
    """Build the wrapped SARSA and LSPI learners, then initialise the base
    Agent with the shared representation/policy/domain/logger."""
    sarsa_args = (representation, policy, domain, logger,
                  initial_alpha, lambda_, alpha_decay_mode, boyan_N0)
    lspi_args = (representation, policy, domain, logger,
                 lspi_iterations, sample_window, epsilon, re_iterations)
    self.SARSA = SARSA(*sarsa_args)
    self.LSPI = LSPI(*lspi_args)
    super(LSPI_SARSA, self).__init__(representation, policy, domain, logger)