Example #1
 def __init__(self,
              nStates,
              nActions,
              alpha,
              gamma,
              n,
              sigma,
              policyUpdateMethod="esoft",
              epsilon=0.1,
              tieBreakingMethod="arbitrary",
              valueInit="zeros"):
     super().__init__(nStates,
                      nActions,
                      alpha,
                      gamma,
                      n,
                      valueInit=valueInit)
     self.name = "n-step Q-sigma"
     self.sigma = sigma  # degree of sampling per step: 1 = full sampling (Sarsa), 0 = pure expectation (tree backup)
     self.policy = StochasticPolicy(
         self.nStates,
         self.nActions,
         policyUpdateMethod=policyUpdateMethod,
         epsilon=epsilon,
         tieBreakingMethod=tieBreakingMethod)  # TODO
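A minimal usage sketch for this constructor, assembled from the call pattern shown in Example #8 below; the environment object, episode count, and hyperparameter values here are illustrative assumptions rather than values from the original code:

# Sketch only: assumes `env` exposes nStates/nActions and that runExperiment
# behaves as in Example #8; hyperparameter values are illustrative.
nEpisodes = 1000
doUpdateBehaviourPolicy = True
agent = nStepQSigma(env.nStates, env.nActions,
                    alpha=0.1, gamma=0.99, n=4, sigma=0.5)
policy_behaviour = StochasticPolicy(env.nStates, env.nActions,
                                    policyUpdateMethod="esoft", epsilon=0.1)
cum_reward, nStepsPerEpisode = runExperiment(nEpisodes, env, agent,
                                             policy_behaviour,
                                             doUpdateBehaviourPolicy)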
Example #2
 def __init__(self, nStates, nActions, gamma, policyUpdateMethod="greedy", epsilon=0.0, tieBreakingMethod="arbitrary"):
   self.name = "Generic Monte Carlo Control Agent"
   self.nStates = nStates
   self.nActions = nActions
   self.gamma = gamma
   self.actionValueTable = np.zeros([self.nStates, self.nActions], dtype=float)
   self.policy = StochasticPolicy(self.nStates,
                                  self.nActions,
                                  policyUpdateMethod=policyUpdateMethod,
                                  epsilon=epsilon,
                                  tieBreakingMethod=tieBreakingMethod)
Example #3
 def __init__(self, nStates, nActions, alpha, doUseBaseline=True):
     self.nStates = nStates
     self.nActions = nActions
     self.alpha = alpha
     self.doUseBaseline = doUseBaseline
      # Action-preference table for the softmax policy, initialized to a small constant
      self.preferencesTable = np.zeros([self.nStates, self.nActions],
                                       dtype=float) + 0.0001
     self.policy = StochasticPolicy(self.nStates,
                                    self.nActions,
                                    policyUpdateMethod="softmax",
                                    tieBreakingMethod="consistent")
      self.count = 0        # number of updates performed so far
      self.avgReward = 0.0  # running average reward (baseline when doUseBaseline is True)
Example #4
 def __init__(self,
              nStates,
              nActions,
              alpha,
              gamma,
              actionSelectionMethod="esoft",
              epsilon=0.01,
              tieBreakingMethod="arbitrary",
              valueInit="zeros"):
     super().__init__(nStates, nActions, alpha, gamma, valueInit=valueInit)
     self.name = "Expected SARSA"
     self.policy = StochasticPolicy(self.nStates,
                                    self.nActions,
                                    policyUpdateMethod="esoft",
                                    epsilon=epsilon,
                                    tieBreakingMethod=tieBreakingMethod)
Example #5
 def __init__(self,
              nStates,
              nActions,
              alpha,
              gamma,
              n,
              valueInit="zeros",
              policyUpdateMethod="greedy",
              epsilon=0.0,
              tieBreakingMethod="consistent"):
     super().__init__(nStates, alpha, gamma, n, valueInit=valueInit)
     self.name = "n-step Per-Decision TD Prediction"
     self.nActions = nActions
     self.policy = StochasticPolicy(self.nStates,
                                    self.nActions,
                                    policyUpdateMethod=policyUpdateMethod,
                                    epsilon=epsilon,
                                    tieBreakingMethod=tieBreakingMethod)
Example #6
  alpha_DP = 0.01 
  gamma_DP = 0.99
  thresh_convergence = 1e-10
  
  alpha_QL = 0.01
  gamma_QL = 0.99

  alpha_GTD = 0.005 
  beta_GTD = 0.05
  gamma_GTD = 0.99
  
  alpha_ETD = 0.03
  gamma_ETD = 0.99
  
  env = BairdsCounterExample()
  behaviour_policy = StochasticPolicy(env.nStates, env.nActions)
  # Behaviour policy: take the dashed action with probability 6/7, the solid action with 1/7
  behaviour_policy.actionProbabilityTable[:,env.ACTION_IDX_DASHED] = 6.0/7.0
  behaviour_policy.actionProbabilityTable[:,env.ACTION_IDX_SOLID] = 1.0/7.0

  target_policy = StochasticPolicy(env.nStates, env.nActions)
  # Target policy: always take the solid action
  target_policy.actionProbabilityTable[:,:] = 0.0
  target_policy.actionProbabilityTable[:,env.ACTION_IDX_SOLID] = 1.0
  
  stateEncodingMatrix = np.zeros([env.nStates, nParams])
  for i in range(env.nStates-1):
    stateEncodingMatrix[i,i] = 2
    stateEncodingMatrix[i,7] = 1
  stateEncodingMatrix[6,6] = 1
  stateEncodingMatrix[6,7] = 2
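  # The loop above encodes each of the first six states as twice its own feature plus
  # the shared eighth feature; the seventh state gets its own feature plus twice the shared one.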
  approximationFunctionArgs = {'af':linearTransform, 'afd':dLinearTransform, 'ftf':FixedStateEncoding, 'stateEncodingMatrix':stateEncodingMatrix}
Example #7
    defaultReward = -1.0
    terminalStates = [(0, 0), (3, 3)]

    # Agent
    gamma = 1.0
    thresh_convergence = 1e-30
    n = 5
    alpha_TDn = 0.01
    alpha_TD = 0.01
    alpha_sumTDError = 0.01

    env = DeterministicGridWorld(sizeX,
                                 sizeY,
                                 defaultReward=defaultReward,
                                 terminalStates=terminalStates)
    policy = StochasticPolicy(env.nStates, env.nActions)
    agent_PE = PolicyEvaluation(env.nStates, env.nActions, gamma,
                                thresh_convergence, env.computeExpectedValue)

    # TD agent to validate the TDn implementation
    agent_TD = TDPrediction(env.nStates, alpha_TD, gamma)
    agent_TDn = nStepTDPrediction(env.nStates, alpha_TDn, gamma, n)

    env.printEnv()

    # Policy evaluation for reference
    for e in range(nEpisodes):
        deltaMax, isConverged = agent_PE.evaluate(policy)

        print("Episode : ", e, " Delta: ", deltaMax)
Example #8
        agent_nStepSARSA = nStepSARSA(env.nStates,
                                      env.nActions,
                                      alpha_nStepSARSA,
                                      gamma_nStepSARSA,
                                      n_nStepSARSA,
                                      epsilon=epsilon_nStepSARSA)
        print("running:", agent_nStepSARSA.getName())
        cum_reward_nStepSARSA, nStepsPerEpisode_nStepSARSA = runExperiment(
            nEpisodes, env, agent_nStepSARSA)

        agent_nStepTB = nStepTreeBackup(env.nStates, env.nActions,
                                        alpha_nStepTB, gamma_nStepTB,
                                        n_nStepTB)
        print("running:", agent_nStepTB.getName())
        policy_behaviour = StochasticPolicy(env.nStates,
                                            env.nActions,
                                            policyUpdateMethod="esoft",
                                            epsilon=epsilon_behaviourPolicy)
        cum_reward_nStepTB, nStepsPerEpisode_nStepTB = runExperiment(
            nEpisodes, env, agent_nStepTB, policy_behaviour,
            doUpdateBehaviourPolicy)

        agent_nStepQSigma = nStepQSigma(env.nStates, env.nActions,
                                        alpha_nStepQSigma, gamma_nStepQSigma,
                                        n_nStepQSigma, sigma_nStepQSigma)
        print("running:", agent_nStepQSigma.getName())
        policy_behaviour = StochasticPolicy(env.nStates,
                                            env.nActions,
                                            policyUpdateMethod="esoft",
                                            epsilon=epsilon_behaviourPolicy)
        cum_reward_nStepQSigma, nStepsPerEpisode_nStepQSigma = runExperiment(
            nEpisodes, env, agent_nStepQSigma, policy_behaviour,
            doUpdateBehaviourPolicy)
Example #9
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

from IRL.environments.Gambler import Blackjack
from IRL.agents.MonteCarlo import MonteCarloOffPolicyPrediction
from IRL.utils.Policies import StochasticPolicy

if __name__ == "__main__":

    nEpisodes = 500000

    # Agent
    gamma = 1.0

    env = Blackjack()
    agent = MonteCarloOffPolicyPrediction(env.nStates, env.nActions, gamma)
    policy_behaviour = StochasticPolicy(env.nStates, env.nActions)
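    # Initialize the agent's (target) policy: hit while the player's sum is below 20,
    # stick on 20 or 21.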
    for i in range(env.nStatesPlayerSum - 1, -1, -1):
        for j in range(env.nStatesDealerShowing):
            for k in [env.USABLE_ACE_YES, env.USABLE_ACE_NO]:
                idx_state = env.getLinearIndex(env.minPlayerSum + i,
                                               env.minDealerShowing + j, k)
                if (env.minPlayerSum + i < 20):
                    actionProb = np.zeros(env.nActions)
                    actionProb[env.ACTION_HIT] = 1.0
                    agent.policy.update(idx_state, actionProb)
                else:
                    actionProb = np.zeros(env.nActions)
                    actionProb[env.ACTION_STICK] = 1.0
                    agent.policy.update(idx_state, actionProb)

    #env.printEnv()