    # Environment
    sizeX = 4
    sizeY = 4
    defaultReward = -1.0
    terminalStates = [(0, 0), (3, 3)]

    # Agent
    gamma = 1.0
    thresh_convergence = 1e-30
    n = 5
    alpha_TDn = 0.01
    alpha_TD = 0.01
    alpha_sumTDError = 0.01

    env = DeterministicGridWorld(sizeX,
                                 sizeY,
                                 defaultReward=defaultReward,
                                 terminalStates=terminalStates)
    policy = StochasticPolicy(env.nStates, env.nActions)
    agent_PE = PolicyEvaluation(env.nStates, env.nActions, gamma,
                                thresh_convergence, env.computeExpectedValue)

    # TD agent to validate the TDn implementation
    agent_TD = TDPrediction(env.nStates, alpha_TD, gamma)
    agent_TDn = nStepTDPrediction(env.nStates, alpha_TDn, gamma, n)

    env.printEnv()

    # Policy evaluation for reference
    for e in range(nEpisodes):
        deltaMax, isConverged = agent_PE.evaluate(policy)
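The TDPrediction and nStepTDPrediction agents are validated against the policy-evaluation baseline above, but their internals are not part of this snippet. As a point of reference, here is a minimal sketch of the standard tabular updates they presumably implement; the function names and signatures below are hypothetical.

# Hypothetical reference helpers; the actual TDPrediction / nStepTDPrediction classes
# are not shown in this listing and may differ in detail.
def td0_update(V, s, r, s_next, alpha, gamma):
    # One-step TD(0): move V(s) toward the bootstrapped target r + gamma * V(s').
    V[s] += alpha * (r + gamma * V[s_next] - V[s])

def n_step_return(rewards, V, s_after_n, gamma):
    # n-step target: G = r_1 + gamma*r_2 + ... + gamma^(n-1)*r_n + gamma^n * V(s_{t+n}),
    # which the n-step agent then uses in V[s_tau] += alpha * (G - V[s_tau]).
    G = sum((gamma ** k) * r for k, r in enumerate(rewards))
    return G + (gamma ** len(rewards)) * V[s_after_n]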
Example #2
    epsilon_SARSA = 0.1
    epsilon_QLearning = 0.1
    epsilon_ExpectedSARSA = 0.1
    epsilon_DoubleQLearning = 0.1

    avg_reward_sums_SARSA = np.zeros(nEpisodes)
    avg_reward_sums_QLearning = np.zeros(nEpisodes)
    avg_reward_sums_ExpectedSARSA = np.zeros(nEpisodes)
    avg_reward_sums_DoubleQLearning = np.zeros(nEpisodes)
    for idx_experiment in range(1, nExperiments + 1):

        print("Experiment : ", idx_experiment)

        env = DeterministicGridWorld(sizeX,
                                     sizeY,
                                     specialRewards=specialRewards,
                                     defaultReward=defaultReward,
                                     terminalStates=terminalStates,
                                     startStates=startStates)

        agent_SARSA = SARSA(env.nStates,
                            env.nActions,
                            alpha_SARSA,
                            gamma_SARSA,
                            epsilon=epsilon_SARSA)
        agent_QLearning = QLearning(env.nStates,
                                    env.nActions,
                                    alpha_QLearning,
                                    gamma_QLearning,
                                    epsilon=epsilon_QLearning)
        agent_ExpectedSARSA = ExpectedSARSA(env.nStates,
                                            env.nActions,
                                            alpha_ExpectedSARSA,
                                            gamma_ExpectedSARSA,
                                            epsilon=epsilon_ExpectedSARSA)
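The agents compared above (SARSA, Q-Learning, Expected SARSA and, per the reward arrays, Double Q-Learning) differ mainly in the bootstrap target of the action-value update. The sketch below shows the standard textbook targets under an epsilon-greedy behaviour policy; it is an assumption about what these classes do, with hypothetical helper names, not code taken from them.

import numpy as np

def epsilon_greedy(Q, s, nActions, epsilon, rng=np.random):
    # Behaviour policy presumably shared by all four agents.
    if rng.random() < epsilon:
        return rng.randint(nActions)
    return int(np.argmax(Q[s]))

def control_target(method, Q, r, s_next, a_next, gamma, epsilon, nActions, Q2=None):
    if method == "SARSA":            # on-policy: bootstrap with the action actually taken
        return r + gamma * Q[s_next, a_next]
    if method == "QLearning":        # off-policy: bootstrap with the greedy action
        return r + gamma * np.max(Q[s_next])
    if method == "ExpectedSARSA":    # expectation over the epsilon-greedy policy
        probs = np.full(nActions, epsilon / nActions)
        probs[np.argmax(Q[s_next])] += 1.0 - epsilon
        return r + gamma * np.dot(probs, Q[s_next])
    if method == "DoubleQLearning":  # select with Q, evaluate with the second table Q2
        return r + gamma * Q2[s_next, np.argmax(Q[s_next])]
    raise ValueError(method)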
    gamma_DynaQ_ASB = 0.95
    epsilon_DynaQ_ASB = 0.2
    nPlanningSteps_DynaQ_ASB = 5
    kappa_ASB = 0.001

    avg_cum_reward_DynaQ = np.zeros(nTimesteps)
    avg_cum_reward_DynaQPlus = np.zeros(nTimesteps)
    avg_cum_reward_DynaQ_ASB = np.zeros(nTimesteps)
    for idxExperiment in range(1, nExperiments + 1):

        print("Experiment:", idxExperiment)

        env = DeterministicGridWorld(sizeX,
                                     sizeY,
                                     startStates=startStates,
                                     terminalStates=terminalStates,
                                     impassableStates=impassableStates,
                                     defaultReward=defaultReward,
                                     specialRewards=specialRewards)
        agent_DynaQ = DynaQ(env.nStates,
                            env.nActions,
                            alpha_DynaQ,
                            gamma_DynaQ,
                            nPlanningSteps_DynaQ,
                            kappa=kappa_DynaQ,
                            epsilon=epsilon_DynaQ)
        cumulative_reward_DynaQ, nStepsPerEpisode_DynaQ = runExperiment(
            nEpisodes, nTimesteps, env, agent_DynaQ, envChangeTimestep)
        avg_cum_reward_DynaQ += (1.0 / idxExperiment) * (
            cumulative_reward_DynaQ - avg_cum_reward_DynaQ)
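Two details in the loop above are worth spelling out. The update avg_cum_reward_DynaQ += (1.0 / idxExperiment) * (cumulative_reward_DynaQ - avg_cum_reward_DynaQ) is the incremental form of the sample mean over experiments, so no separate running sum is needed. The kappa argument passed to DynaQ is the Dyna-Q+ exploration-bonus weight; where exactly the class applies it is not visible here, but the standard formulation is sketched below (hypothetical helper, assumed placement in the planning backup).

import numpy as np

def exploration_bonus(kappa, tau):
    # Dyna-Q+ adds kappa * sqrt(tau) to the modelled reward, where tau counts the real
    # timesteps since the state-action pair was last tried in the environment.
    return kappa * np.sqrt(tau)

# e.g. inside a planning step (assumed form):
# Q[s, a] += alpha * (r + exploration_bonus(kappa, tau[s, a])
#                     + gamma * Q[s_next].max() - Q[s, a])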
  nEpisodes = 1000

  # Environment
  sizeX = 5
  sizeY = 5
  defaultReward = 0.0
  outOfGridReward = -1.0
  specialRewards = {((1,0),0): 10, ((1,0),1): 10, ((1,0),2): 10, ((1,0),3): 10,
                    ((3,0),0): 5, ((3,0),1): 5, ((3,0),2): 5, ((3,0),3): 5}
  specialStateTransitions = {(1,0): (1,4), (3,0): (3,2)}

  # Agent
  gamma = 0.9
  thresh_convergence = 1e-30

  env = DeterministicGridWorld(sizeX, sizeY, specialRewards=specialRewards, 
    specialStateTransitions=specialStateTransitions, defaultReward=defaultReward, outOfGridReward=outOfGridReward)
  agent = PolicyIteration(env.nStates, env.nActions, gamma, thresh_convergence, env.computeExpectedValue)
  
  env.printEnv()
    
  for e in range(nEpisodes):
    deltaMax, isConverged, isPolicyStable = agent.update()
    
    print("Episode : ", e, " Delta: ", deltaMax, " isConverged: ", isConverged, " isPolicyStable: ", isPolicyStable)
    
    print()
    printStr = ""
    for y in range(sizeY):
      for x in range(sizeX):
        i = env.getLinearIndex(x,y)
        printStr = printStr + "{:.2f}".format(agent.valueTable[i]) + "\t"
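Each agent.update() call above reports deltaMax, isConverged and isPolicyStable, which suggests an evaluation sweep followed by a greedy improvement pass. The real PolicyIteration class and the env.computeExpectedValue signature are not shown in this listing, so the following is only a rough sketch of that pattern with an assumed expected_value(s, a, V, gamma) callable.

import numpy as np

def policy_iteration_step(V, policy, expected_value, nStates, nActions, gamma, thresh):
    # One evaluation sweep for the current (deterministic) policy.
    deltaMax = 0.0
    for s in range(nStates):
        v_new = expected_value(s, policy[s], V, gamma)   # assumed signature
        deltaMax = max(deltaMax, abs(v_new - V[s]))
        V[s] = v_new
    isConverged = deltaMax < thresh
    # Greedy improvement with respect to the updated value table.
    isPolicyStable = True
    for s in range(nStates):
        q_values = [expected_value(s, a, V, gamma) for a in range(nActions)]
        a_greedy = int(np.argmax(q_values))
        if a_greedy != policy[s]:
            policy[s] = a_greedy
            isPolicyStable = False
    return deltaMax, isConverged, isPolicyStable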
Example #5
if __name__ == "__main__":

    nEpisodes = 1000

    # Environment
    sizeX = 4
    sizeY = 4
    defaultReward = -1.0
    terminalStates = [(0, 0), (3, 3)]

    # Agent
    gamma = 1.0
    thresh_convergence = 1e-30

    env = DeterministicGridWorld(sizeX,
                                 sizeY,
                                 defaultReward=defaultReward,
                                 terminalStates=terminalStates)
    policy = StochasticPolicy(env.nStates, env.nActions)
    agent = PolicyEvaluation(env.nStates, env.nActions, gamma,
                             thresh_convergence, env.computeExpectedValue)

    env.printEnv()

    for e in range(nEpisodes):
        deltaMax, isConverged = agent.evaluate(policy)

        print("Episode : ", e, " Delta: ", deltaMax)

        printStr = ""
        for y in range(sizeY):
            for x in range(sizeX):
Example #6
if __name__ == "__main__":

  nEpisodes = 1000

  # Environment
  sizeX = 4
  sizeY = 5
  defaultReward = -1.0
  terminalStates = [(0,0), (3,3)]
  doUnblockAdditionalState = True   # Set to True for the second part of Exercise 4.2
  
  # Agent
  gamma = 0.9
  thresh_convergence = 1e-30

  env = DeterministicGridWorld(sizeX, sizeY, defaultReward=defaultReward, terminalStates=terminalStates)
  
  # First part of Exercise 4.2: addition of a new state.
  # The grid was built with an extra row (sizeY = 5), so states 16-19 exist; first
  # remove all transitions from the original bottom row (states 12-15) into that row.
  env.stateTransitionProbs[12, :, 16:] = 0.0
  env.stateTransitionProbs[13, :, 16:] = 0.0
  env.stateTransitionProbs[14, :, 16:] = 0.0
  env.stateTransitionProbs[15, :, 16:] = 0.0

  # Disable the unused cells of the extra row.
  env.stateTransitionProbs[16, :, :] = 0.0
  env.stateTransitionProbs[18, :, :] = 0.0
  env.stateTransitionProbs[19, :, :] = 0.0

  # The new state 15 of the question is indexed as 17 here; wire up its transitions.
  env.stateTransitionProbs[17, :, :] = 0.0
  env.stateTransitionProbs[17, 0, 13] = 1.0  # N -> state 13
  env.stateTransitionProbs[17, 1, 17] = 1.0  # S -> stays in state 17 (itself)
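The state indices used above follow from the grid's linear indexing. Assuming env.getLinearIndex maps (x, y) row-major as y * sizeX + x (an assumption, but consistent with the new state at (1, 4) being indexed 17, directly below state 13), the layout is:

# Assumed row-major indexing with sizeX = 4, sizeY = 5:
#   states 0-15  -> the original 4x4 grid (bottom row y = 3 is 12-15)
#   states 16-19 -> the appended fifth row (y = 4)
#   the exercise's new "state 15" at (1, 4) -> 4 * 4 + 1 = 17
def linear_index(x, y, sizeX=4):
    return y * sizeX + x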
Example #7
        print("Scale : ", scale)

        sizeX_scaled, sizeY_scaled, startStates_scaled, terminalStates_scaled, impassableStates_scaled = scaleEnv(
            scale, sizeX, sizeY, startStates, terminalStates,
            impassableStates_bounds)
        specialRewards_scaled = {
            ((terminalStates_scaled[0][0], terminalStates_scaled[0][1] + 1), 0):
            1.0,
            ((terminalStates_scaled[0][0] - 1, terminalStates_scaled[0][1]), 2):
            1.0
        }
        env = DeterministicGridWorld(sizeX_scaled,
                                     sizeY_scaled,
                                     startStates=startStates_scaled,
                                     terminalStates=terminalStates_scaled,
                                     impassableStates=impassableStates_scaled,
                                     defaultReward=defaultReward,
                                     specialRewards=specialRewards_scaled)

        env.printEnv()

        agent_PS = PrioritizedSweeping(env.nStates,
                                       env.nActions,
                                       alpha_PS,
                                       gamma_PS,
                                       nPlanningSteps_PS,
                                       theta=theta_PS,
                                       epsilon=epsilon_PS)
        cumulative_reward_PS, nStepsPerEpisode_PS, nUpdates_PS = runExperiment(
            nTimesteps,