# Environment
sizeX = 4
sizeY = 4
defaultReward = -1.0
terminalStates = [(0, 0), (3, 3)]

# Agent
gamma = 1.0
thresh_convergence = 1e-30
n = 5
alpha_TDn = 0.01
alpha_TD = 0.01
alpha_sumTDError = 0.01

env = DeterministicGridWorld(sizeX, sizeY, defaultReward=defaultReward,
                             terminalStates=terminalStates)
policy = StochasticPolicy(env.nStates, env.nActions)
agent_PE = PolicyEvaluation(env.nStates, env.nActions, gamma,
                            thresh_convergence, env.computeExpectedValue)

# TD agent to validate the TDn implementation
agent_TD = TDPrediction(env.nStates, alpha_TD, gamma)
agent_TDn = nStepTDPrediction(env.nStates, alpha_TDn, gamma, n)

env.printEnv()

# Policy evaluation for reference
for e in range(nEpisodes):
    deltaMax, isConverged = agent_PE.evaluate(policy)
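
# For reference, a minimal sketch of the tabular n-step TD update that a class
# like nStepTDPrediction above presumably implements. This standalone function
# and its argument names are assumptions for illustration, not the actual API.
import numpy as np

def n_step_td_update(V, states, rewards, tau, T, n, alpha, gamma):
    # n-step return: G = sum_{i=tau+1}^{min(tau+n, T)} gamma^(i-tau-1) * R_i
    G = sum(gamma ** (i - tau - 1) * rewards[i]
            for i in range(tau + 1, min(tau + n, T) + 1))
    if tau + n < T:
        # Bootstrap from the value estimate n steps ahead
        G += gamma ** n * V[states[tau + n]]
    V[states[tau]] += alpha * (G - V[states[tau]])
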
epsilon_SARSA = 0.1
epsilon_QLearning = 0.1
epsilon_ExpectedSARSA = 0.1
epsilon_DoubleQLearning = 0.1

avg_reward_sums_SARSA = np.zeros(nEpisodes)
avg_reward_sums_QLearning = np.zeros(nEpisodes)
avg_reward_sums_ExpectedSARSA = np.zeros(nEpisodes)
avg_reward_sums_DoubleQLearning = np.zeros(nEpisodes)

for idx_experiment in range(1, nExperiments + 1):
    print("Experiment : ", idx_experiment)

    env = DeterministicGridWorld(sizeX, sizeY, specialRewards=specialRewards,
                                 defaultReward=defaultReward,
                                 terminalStates=terminalStates,
                                 startStates=startStates)
    agent_SARSA = SARSA(env.nStates, env.nActions, alpha_SARSA, gamma_SARSA,
                        epsilon=epsilon_SARSA)
    agent_QLearning = QLearning(env.nStates, env.nActions, alpha_QLearning,
                                gamma_QLearning, epsilon=epsilon_QLearning)
    agent_ExpectedSARSA = ExpectedSARSA(env.nStates, env.nActions,
                                        alpha_ExpectedSARSA, gamma_ExpectedSARSA,
                                        epsilon=epsilon_ExpectedSARSA)
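
# The four agents above differ only in their bootstrap target. A hedged,
# self-contained summary for reference: Q, Q1, Q2 are assumed
# (nStates, nActions) tables and pi_next an epsilon-greedy action distribution
# in s_next; none of these names come from the repository itself.
import numpy as np

def td_targets(Q, Q1, Q2, pi_next, r, s_next, a_next, gamma):
    target_SARSA = r + gamma * Q[s_next, a_next]               # on-policy sample
    target_QLearning = r + gamma * np.max(Q[s_next])           # greedy bootstrap
    target_ExpectedSARSA = r + gamma * pi_next.dot(Q[s_next])  # expectation under pi
    # Double Q-learning: select the action with one table, evaluate with the other
    target_DoubleQLearning = r + gamma * Q2[s_next, np.argmax(Q1[s_next])]
    return target_SARSA, target_QLearning, target_ExpectedSARSA, target_DoubleQLearning
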
gamma_DynaQ_ASB = 0.95
epsilon_DynaQ_ASB = 0.2
nPlanningSteps_DynaQ_ASB = 5
kappa_ASB = 0.001

avg_cum_reward_DynaQ = np.zeros(nTimesteps)
avg_cum_reward_DynaQPlus = np.zeros(nTimesteps)
avg_cum_reward_DynaQ_ASB = np.zeros(nTimesteps)

for idxExperiment in range(1, nExperiments + 1):
    print("Experiment:", idxExperiment)

    env = DeterministicGridWorld(sizeX, sizeY, startStates=startStates,
                                 terminalStates=terminalStates,
                                 impassableStates=impassableStates,
                                 defaultReward=defaultReward,
                                 specialRewards=specialRewards)
    agent_DynaQ = DynaQ(env.nStates, env.nActions, alpha_DynaQ, gamma_DynaQ,
                        nPlanningSteps_DynaQ, kappa=kappa_DynaQ,
                        epsilon=epsilon_DynaQ)

    cumulative_reward_DynaQ, nStepsPerEpisode_DynaQ = runExperiment(
        nEpisodes, nTimesteps, env, agent_DynaQ, envChangeTimestep)

    # Incremental (running) average of the cumulative-reward curve over experiments
    avg_cum_reward_DynaQ += (1.0 / idxExperiment) * (
        cumulative_reward_DynaQ - avg_cum_reward_DynaQ)
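
# The kappa values above drive exploration bonuses. A hedged sketch of the two
# variants being compared (function and variable names here are assumptions):
# Dyna-Q+ adds kappa * sqrt(tau) to the simulated reward during planning, while
# the action-selection-bonus (ASB) variant applies the same bonus only when
# picking actions, leaving the value updates untouched.
import numpy as np

def dyna_q_plus_planning_reward(r, tau_sa, kappa):
    # tau_sa: timesteps since this (state, action) pair was last tried for real
    return r + kappa * np.sqrt(tau_sa)

def asb_greedy_action(Q, tau, kappa, s):
    # tau[s]: per-action elapsed-time vector for state s
    return int(np.argmax(Q[s] + kappa * np.sqrt(tau[s])))
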
nEpisodes = 1000

# Environment
sizeX = 5
sizeY = 5
defaultReward = 0.0
outOfGridReward = -1.0
specialRewards = {((1, 0), 0): 10, ((1, 0), 1): 10, ((1, 0), 2): 10, ((1, 0), 3): 10,
                  ((3, 0), 0): 5, ((3, 0), 1): 5, ((3, 0), 2): 5, ((3, 0), 3): 5}
specialStateTransitions = {(1, 0): (1, 4), (3, 0): (3, 2)}

# Agent
gamma = 0.9
thresh_convergence = 1e-30

env = DeterministicGridWorld(sizeX, sizeY, specialRewards=specialRewards,
                             specialStateTransitions=specialStateTransitions,
                             defaultReward=defaultReward,
                             outOfGridReward=outOfGridReward)
agent = PolicyIteration(env.nStates, env.nActions, gamma, thresh_convergence,
                        env.computeExpectedValue)

env.printEnv()

for e in range(nEpisodes):
    deltaMax, isConverged, isPolicyStable = agent.update()
    print("Episode : ", e, " Delta: ", deltaMax, " isConverged: ", isConverged,
          " isPolicyStable: ", isPolicyStable)

print()
printStr = ""
for y in range(sizeY):
    for x in range(sizeX):
        i = env.getLinearIndex(x, y)
        printStr = printStr + "{:.2f}".format(agent.valueTable[i]) + "\t"
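
# A hedged sketch of what a single agent.update() call plausibly does: one
# evaluation sweep under the current policy followed by greedy improvement.
# expected_value(s, a, V, gamma) stands in for env.computeExpectedValue; both
# it and the deterministic policy array are assumptions, not the actual class.
import numpy as np

def policy_iteration_sweep(V, policy, nStates, nActions, expected_value, gamma):
    deltaMax = 0.0
    for s in range(nStates):                  # evaluation sweep
        v_old = V[s]
        V[s] = expected_value(s, policy[s], V, gamma)
        deltaMax = max(deltaMax, abs(V[s] - v_old))
    isPolicyStable = True
    for s in range(nStates):                  # greedy improvement
        q = [expected_value(s, a, V, gamma) for a in range(nActions)]
        a_greedy = int(np.argmax(q))
        if a_greedy != policy[s]:
            policy[s] = a_greedy
            isPolicyStable = False
    return deltaMax, isPolicyStable
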
if __name__ == "__main__": nEpisodes = 1000 # Environment sizeX = 4 sizeY = 4 defaultReward = -1.0 terminalStates = [(0, 0), (3, 3)] # Agent gamma = 1.0 thresh_convergence = 1e-30 env = DeterministicGridWorld(sizeX, sizeY, defaultReward=defaultReward, terminalStates=terminalStates) policy = StochasticPolicy(env.nStates, env.nActions) agent = PolicyEvaluation(env.nStates, env.nActions, gamma, thresh_convergence, env.computeExpectedValue) env.printEnv() for e in range(nEpisodes): deltaMax, isConverged = agent.evaluate(policy) print("Episode : ", e, " Delta: ", deltaMax) printStr = "" for y in range(sizeY): for x in range(sizeX):
if __name__=="__main__": nEpisodes = 1000 # Environment sizeX = 4 sizeY = 5 defaultReward = -1.0 terminalStates= [(0,0), (3,3)] doUnblockAdditionalState = True # Set true for the second part of exercise 4.2 # Agent gamma = 0.9 thresh_convergence = 1e-30 env = DeterministicGridWorld(sizeX, sizeY, defaultReward=defaultReward, terminalStates=terminalStates) # First part of exercise 4.2: addition of new state env.stateTransitionProbs[12,:,16:]=0.0 env.stateTransitionProbs[13,:,16:]=0.0 env.stateTransitionProbs[14,:,16:]=0.0 env.stateTransitionProbs[15,:,16:]=0.0 env.stateTransitionProbs[16,:,:]=0.0 env.stateTransitionProbs[18,:,:]=0.0 env.stateTransitionProbs[19,:,:]=0.0 # State 15 of the question is indexed as 17 env.stateTransitionProbs[17,:,:]=0.0 env.stateTransitionProbs[17,0,13]=1.0 #N env.stateTransitionProbs[17,1,17]=1.0 #S
print("Scale : ", scale) sizeX_scaled, sizeY_scaled, startStates_scaled, terminalStates_scaled, impassableStates_scaled = scaleEnv( scale, sizeX, sizeY, startStates, terminalStates, impassableStates_bounds) specialRewards_scaled = { ((terminalStates_scaled[0][0], terminalStates_scaled[0][1] + 1), 0): 1.0, ((terminalStates_scaled[0][0] - 1, terminalStates_scaled[0][1]), 2): 1.0 } env = DeterministicGridWorld(sizeX_scaled, sizeY_scaled, startStates=startStates_scaled, terminalStates=terminalStates_scaled, impassableStates=impassableStates_scaled, defaultReward=defaultReward, specialRewards=specialRewards_scaled) env.printEnv() agent_PS = PrioritizedSweeping(env.nStates, env.nActions, alpha_PS, gamma_PS, nPlanningSteps_PS, theta=theta_PS, epsilon=epsilon_PS) cumulative_reward_PS, nStepsPerEpisode_PS, nUpdates_PS = runExperiment( nTimesteps,