Example #1
    def __init__(self, nParams, nActions, approximationFunctionArgs, **kwargs):
        self.nParams = nParams
        self.nActions = nActions
        self.af_kwargs = approximationFunctionArgs
        self.af = getValueFromDict(self.af_kwargs, "af")
        self.actionSelectionMethod = getValueFromDict(kwargs,
                                                      "actionSelectionMethod",
                                                      "greedy")
        self.epsilon = getValueFromDict(kwargs, "epsilon", 0.0)
        self.tieBreakingMethod = getValueFromDict(kwargs, "tieBreakingMethod",
                                                  "consistent")
        self.w = np.zeros([self.nParams], dtype=float)

        if (self.tieBreakingMethod == "arbitrary"):
            # break ties randomly among the maximal action values
            self.argmax_function = lambda x: np.random.choice(
                np.flatnonzero(x == np.max(x)))
        elif (self.tieBreakingMethod == "consistent"):
            # np.argmax always returns the first maximal index
            self.argmax_function = np.argmax
        else:
            sys.exit(
                "ERROR: FunctionApproximationPolicy: tieBreakingMethod not recognized!"
            )

        if (self.actionSelectionMethod == "egreedy"):
            self.actionSelection_function = selectAction_egreedy
        elif (self.actionSelectionMethod == "softmax"):
            self.actionSelection_function = selectAction_softmax
        elif (self.actionSelectionMethod == "greedy"):
            self.actionSelection_function = selectAction_greedy
        elif (self.actionSelectionMethod == "esoft"):
            self.actionSelection_function = selectAction_esoft
        else:
            sys.exit(
                "ERROR: FunctionApproximationPolicy: actionSelectionMethod not recognized!"
            )
Example #2
 def __init__(self,
              nParams,
              nActions,
              alpha,
              gamma,
              lambd,
              approximationFunctionArgs,
              actionSelectionMethod="egreedy",
              epsilon=0.01):
     self.name = "True Online SARSA"
     self.nParams = nParams
     self.nActions = nActions
     self.alpha = alpha
     self.gamma = gamma
     self.lambd = lambd
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.ftf = getValueFromDict(self.af_kwargs, "ftf")
     self.w = np.zeros([self.nParams], dtype=float)
     self.z = np.zeros([self.nParams], dtype=float)
     self.q_old = 0.0
     self.policy = FunctionApproximationPolicy(
         self.nParams,
         self.nActions,
         self.af_kwargs,
         actionSelectionMethod=actionSelectionMethod,
         epsilon=epsilon)
Example #3
def softmaxLinear(w, state, action=None, **kwargs):
    ftf = getValueFromDict(kwargs, "ftf")
    nActions = getValueFromDict(kwargs, "nActions")
    p = normalize_softmax(
        np.array(
            [np.dot(w.T, ftf(state, a, **kwargs)) for a in range(nActions)]))
    return p if action is None else p[action]
Example #4
 def __init__(self,
              nParams,
              nActions,
              alpha,
              gamma,
              lambd,
              approximationFunctionArgs,
              doAccumulateTraces=False,
              doClearTraces=False,
              actionSelectionMethod="egreedy",
              epsilon=0.01):
     self.name = "SARSA(Lambda)"
     self.nParams = nParams
     self.nActions = nActions
     self.alpha = alpha
     self.gamma = gamma
     self.lambd = lambd
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.ftf = getValueFromDict(self.af_kwargs, "ftf")
     self.doAccumulateTraces = doAccumulateTraces
     self.doClearTraces = doClearTraces
     self.w = np.zeros([self.nParams], dtype=float)
     self.z = np.zeros([self.nParams], dtype=float)
     self.policy = FunctionApproximationPolicy(
         self.nParams,
         self.nActions,
         self.af_kwargs,
         actionSelectionMethod=actionSelectionMethod,
         epsilon=epsilon)
Example #5
def dLogSoftmaxLinear(w, state, action=None, **kwargs):
    ftf = getValueFromDict(kwargs, "ftf")
    nActions = getValueFromDict(kwargs, "nActions")
    features = np.array([ftf(state, a, **kwargs) for a in range(nActions)],
                        dtype=float)
    p = softmaxLinear(w, state, **kwargs)
    expectation = np.dot(p, features)
    return features[action] - expectation
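dLogSoftmaxLinear implements the eligibility vector of a linear softmax policy, grad log pi(a|s, w) = x(s, a) - sum_b pi(b|s, w) x(s, b). A minimal sanity check, assuming normalize_softmax is a standard softmax; the toy feature function below is made up for illustration:
kwargs = {"ftf": lambda s, a, **kw: s * np.eye(2)[a], "nActions": 2}  # hypothetical 2-action feature function
theta = np.zeros(2)
g = dLogSoftmaxLinear(theta, 1.0, action=0, **kwargs)
# with theta = 0 the policy is uniform, so g = x(s,0) - 0.5*(x(s,0) + x(s,1)) = [0.5, -0.5]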
Example #6
 def __init__(self, nParams, alpha, gamma, approximationFunctionArgs):
     self.name = "Semi-gradient TD Prediction"
     self.nParams = nParams
     self.alpha = alpha
     self.gamma = gamma
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.w = np.zeros([self.nParams], dtype=float)
Example #7
 def __init__(self, nParams, gamma, epsilon, approximationFunctionArgs):
     self.name = "LSTD"
     self.nParams = nParams
     self.gamma = gamma
     self.epsilon = epsilon
     self.af_kwargs = approximationFunctionArgs
     self.ftf = getValueFromDict(self.af_kwargs, "ftf")
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.invA = np.eye(self.nParams) * self.epsilon
     self.b = np.zeros([self.nParams, 1])
     self.w = np.zeros([self.nParams], dtype=float)
Example #8
 def __init__(self, nParams, alpha, gamma, lambd,
              approximationFunctionArgs):
     self.name = "Semi-Gradient TD(Lambda)"
     self.nParams = nParams
     self.alpha = alpha
     self.gamma = gamma
     self.lambd = lambd
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.w = np.zeros([self.nParams], dtype=float)
     self.z = np.zeros([self.nParams], dtype=float)
Example #9
 def __init__(self, nParams, alpha, gamma, lambd,
              approximationFunctionArgs):
     self.name = "Online Lambda Return"
     self.nParams = nParams
     self.alpha = alpha
     self.gamma = gamma
     self.lambd = lambd
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.w = np.zeros([self.nParams], dtype=float)
     self.bufferExperience = []
Example #10
 def __init__(self, nStates, nParams, alpha, gamma,
              approximationFunctionArgs):
     self.name = "Gradient Monte Carlo Prediction"
     self.nStates = nStates
     self.nParams = nParams
     self.alpha = alpha
     self.gamma = gamma
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.returns = {}
     self.visitCounts = np.zeros([self.nStates], dtype=int)
     self.w = np.zeros([self.nParams], dtype=float)
Example #11
def fourier(state, action=None, **kwargs):
  nParams = getValueFromDict(kwargs, "nParams") 
  stateNormFactor = getValueFromDict(kwargs, "stateNormFactor", 1.0)
  if action is None:
    nActions = 1
    idx_action = 0
  else:
    nActions = getValueFromDict(kwargs, "nActions")
    idx_action = action
  stateFeatureVectorSize = nParams//nActions
  stateFeatureVector = np.array([np.cos(i*np.pi*state*stateNormFactor) for i in range(stateFeatureVectorSize)], dtype=float)
  featureVector = np.zeros(nParams)
  featureVector[idx_action*stateFeatureVectorSize:(idx_action+1)*stateFeatureVectorSize] = stateFeatureVector
  return featureVector
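Feature transfer functions such as fourier are paired with a linear approximation function in the classes above (the "af" and "ftf" entries of approximationFunctionArgs). A minimal sketch, assuming getValueFromDict is a plain dict lookup with an optional default; linearApprox is a hypothetical helper, not part of the original code:
def linearApprox(w, state, action=None, **kwargs):
  # q(s, a; w) = w . x(s, a), with x(s, a) produced by the feature transfer function
  ftf = getValueFromDict(kwargs, "ftf")
  return np.dot(w, ftf(state, action, **kwargs))

af_kwargs = {"af": linearApprox, "ftf": fourier, "nParams": 8, "nActions": 2}  # illustrative sizes
w = np.zeros(8)
q_sa = linearApprox(w, 0.5, 1, **af_kwargs)  # value of action 1 in state 0.5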
Example #12
def radialBasisFunction(state, action=None, **kwargs):
  mu = getValueFromDict(kwargs, "mu")
  sigma = getValueFromDict(kwargs, "sigma")
  if action is None:
    nActions = 1
    idx_action = 0
  else:
    nActions = getValueFromDict(kwargs, "nActions")
    idx_action = action
  stateFeatureVector = np.exp(-((state-mu)**2)/(2*sigma**2))
  stateFeatureVectorSize = len(stateFeatureVector)
  featureVector = np.zeros(stateFeatureVectorSize*nActions)
  featureVector[idx_action*stateFeatureVectorSize:(idx_action+1)*stateFeatureVectorSize] = stateFeatureVector
  return featureVector
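A small usage sketch of the RBF features above, with arbitrarily chosen centers and width:
mu = np.linspace(0.0, 1.0, 5)                  # assumed feature centers
x = radialBasisFunction(0.3, mu=mu, sigma=0.2)
# x[i] = exp(-(0.3 - mu[i])**2 / (2 * 0.2**2)); entries peak at the centers closest to the state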
Example #13
 def __init__(self, nParams, alpha, beta, gamma, targetPolicy,
              approximationFunctionArgs):
     self.name = "Gradient TD Prediction"
     self.nParams = nParams
     self.alpha = alpha
     self.beta = beta
     self.gamma = gamma
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.ftf = getValueFromDict(self.af_kwargs, "ftf")
     self.policy = targetPolicy
     self.w = np.zeros([self.nParams], dtype=float)
     self.v = np.zeros([self.nParams], dtype=float)
Example #14
 def __init__(self, alpha_w, alpha_theta, gamma, nParams_w,
              approximationFunctionArgs, nParams_theta, nActions,
              policyApproximationFunctionArgs):
     self.name = "REINFORCE with Baseline"
     self.alpha_w = alpha_w
     self.alpha_theta = alpha_theta
     self.gamma = gamma
     self.nParams_w = nParams_w
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.w = np.zeros([self.nParams_w], dtype=float)
     self.policy = ParametrizedPolicy(nParams_theta, nActions,
                                      policyApproximationFunctionArgs)
     self.bufferExperience = []
Example #15
 def __init__(self, nStates, nActions, nParams, gamma, alpha,
              thresh_convergence, expectedValueFunction,
              approximationFunctionArgs):
     self.name = "Semi-Gradient Policy Evaluation"
     self.nStates = nStates
     self.nActions = nActions
     self.nParams = nParams
     self.gamma = gamma
     self.alpha = alpha
     self.thresh_convergence = thresh_convergence
     self.computeExpectedValue = expectedValueFunction
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.w = np.zeros([self.nParams], dtype=float)
Example #16
 def __init__(self, alpha_w, alpha_theta, gamma, nParams_w,
              approximationFunctionArgs, nParams_theta, nActions,
              policyApproximationFunctionArgs):
     self.name = "One step Actor-Critic"
     self.alpha_w = alpha_w
     self.alpha_theta = alpha_theta
     self.gamma = gamma
     self.nParams_w = nParams_w
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.w = np.zeros([self.nParams_w], dtype=float)
     self.I = 1.0
     self.policy = ParametrizedPolicy(nParams_theta, nActions,
                                      policyApproximationFunctionArgs)
Example #17
 def __init__(self, nStates, nActions, nParams, gamma, alpha, beta, reward,
              approximationFunctionArgs):
     self.name = "Expected TDC"
     self.nStates = nStates
     self.nActions = nActions
     self.nParams = nParams
     self.gamma = gamma
     self.alpha = alpha
     self.beta = beta
     self.reward = reward
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.ftf = getValueFromDict(self.af_kwargs, "ftf")
     self.w = np.zeros([self.nParams], dtype=float)
     self.v = np.zeros([self.nParams], dtype=float)
Example #18
def calculateProjectionMatrix(nStates, nParams, ftf_args, mu):
  ftf = getValueFromDict(ftf_args, "ftf")
  X = np.zeros((nStates, nParams))
  for s in range(nStates):
    X[s, :] = ftf(s, **ftf_args)
  D = np.diag(mu)
  return X @ np.linalg.pinv(X.T @ D @ X) @ X.T @ D
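calculateProjectionMatrix builds the weighted least-squares projection X (X^T D X)^+ X^T D with D = diag(mu). A small sketch, assuming a uniform state distribution and the stateAggregation features from Example #24 below (mapValues is assumed to rescale linearly):
nStates, nParams = 5, 5
mu = np.ones(nStates) / nStates                                      # uniform on-policy distribution
ftf_args = {"ftf": stateAggregation, "nParams": nParams, "nStates": nStates}
P = calculateProjectionMatrix(nStates, nParams, ftf_args, mu)
# with one-hot features and nParams == nStates, P is (numerically) the identity matrix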
Example #19
def selectAction_egreedy(actionValues, **kwargs):
    argmax_function = getValueFromDict(kwargs, "argmaxfun", np.argmax)
    epsilon = kwargs["epsilon"]
    if np.random.rand() < epsilon:
        action = np.random.randint(0, len(actionValues))
    else:
        action = argmax_function(actionValues)
    return action
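A quick usage sketch of the epsilon-greedy selector (values are illustrative):
actionValues = np.array([0.1, 0.5, 0.3])
action = selectAction_egreedy(actionValues, epsilon=0.1)
# returns the greedy index 1 with probability 0.9, otherwise a uniformly random index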
Example #20
 def __init__(self,
              nParams,
              nActions,
              approximationFunctionArgs,
              weightInit="random"):
     self.nParams = nParams
     self.nActions = nActions
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.weightInit = weightInit
     if self.weightInit == "zeros":
         self.theta = np.zeros([self.nParams], dtype=float)
     elif self.weightInit == "random":
         self.theta = np.random.randn(self.nParams)
     else:
         sys.exit("ERROR: ParametrizedPolicy: weightInit not recognized!")
Example #21
def selectAction_UCB(actionValues, **kwargs):
    argmax_function = getValueFromDict(kwargs, "argmaxfun", np.argmax)
    c = kwargs["c"]
    t = kwargs["t"]
    N = kwargs["N"]
    if np.min(N) == 0:
        return np.argmin(N)
    else:
        return argmax_function(actionValues + c * np.sqrt(np.log(t) / N))
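A minimal sketch of the UCB selector: any untried action (count zero) is chosen first; otherwise the exploration bonus c*sqrt(ln(t)/N) is added to the action values (numbers are made up):
actionValues = np.array([1.0, 0.8, 0.9])
N = np.array([10, 2, 5])               # per-action selection counts
t = int(np.sum(N))                     # current time step
action = selectAction_UCB(actionValues, c=2.0, t=t, N=N)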
Example #22
 def __init__(self, alpha_w, alpha_theta, alpha_r, lambd_w, lambd_theta,
              nParams_w, approximationFunctionArgs, nParams_theta, nActions,
              policyApproximationFunctionArgs):
     self.name = "Actor-Criticwith Eligibility Traces (average reward)"
     self.alpha_w = alpha_w
     self.alpha_theta = alpha_theta
     self.alpha_r = alpha_r
     self.lambd_w = lambd_w
     self.lambd_theta = lambd_theta
     self.nParams_w = nParams_w
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.w = np.zeros([self.nParams_w], dtype=float)
     self.z_w = np.zeros([self.nParams_w], dtype=float)
     self.z_theta = np.zeros([nParams_theta], dtype=float)  # constructor argument; self.nParams_theta is never assigned
     self.avgR = 0.0
     self.policy = ParametrizedPolicy(nParams_theta, nActions,
                                      policyApproximationFunctionArgs)
Example #23
def polynomial(state, action=None, **kwargs):
  nParams = getValueFromDict(kwargs, "nParams")
  stateNormFactor = getValueFromDict(kwargs, "stateNormFactor", 1.0)
  c = getValueFromDict(kwargs, "c")
  if action is None:
    nActions = 1
    idx_action = 0
  else:
    nActions = getValueFromDict(kwargs, "nActions")
    idx_action = action
  stateFeatureVectorSize = nParams//nActions
  stateVector = buildStateVector(state*stateNormFactor)
  stateFeatureVector = np.ones(stateFeatureVectorSize)
  for i in range(stateFeatureVectorSize):
    for j in range(len(stateVector)):
      stateFeatureVector[i]*=stateVector[j]**c[i,j]
  featureVector = np.zeros(nParams)
  featureVector[idx_action*stateFeatureVectorSize:(idx_action+1)*stateFeatureVectorSize] = stateFeatureVector
  return featureVector
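A small sketch of the polynomial features for a scalar state, assuming buildStateVector wraps the scalar in a length-1 array; the exponent matrix c is chosen arbitrarily:
c = np.array([[0], [1], [2]])              # exponents for the terms 1, s, s^2
x = polynomial(0.5, nParams=3, c=c)        # -> [1.0, 0.5, 0.25]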
Example #24
def stateAggregation(state, action=None, **kwargs):
  '''
  Simple tile coding with one grid tiling and one state dimension
  '''
  nParams = getValueFromDict(kwargs, "nParams") 
  nStates = getValueFromDict(kwargs, "nStates") 
  if action is None:
    nActions = 1
    idx_action = 0
  else:
    nActions = getValueFromDict(kwargs, "nActions")
    idx_action = action
  stateFeatureVectorSize = nParams//nActions
  stateFeatureVector = np.zeros(stateFeatureVectorSize, dtype=int)
  mappedIdx = int(mapValues(state, 0, nStates, 0, stateFeatureVectorSize))
  if mappedIdx>=0 and mappedIdx<stateFeatureVectorSize:
    stateFeatureVector[mappedIdx] = 1
  featureVector = np.zeros(nParams)
  featureVector[idx_action*stateFeatureVectorSize:(idx_action+1)*stateFeatureVectorSize] = stateFeatureVector
  return featureVector
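A tiny usage sketch, assuming mapValues rescales linearly from [0, nStates) to [0, nParams): with 100 states aggregated into 10 features, states 0-9 activate feature 0, states 10-19 feature 1, and so on:
x = stateAggregation(25, nParams=10, nStates=100)
# one-hot vector of length 10 with x[2] == 1 under the assumed mapValues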
Example #25
 def __init__(self,
              nParams,
              nActions,
              alpha,
              approximationFunctionArgs,
              actionSelectionMethod="egreedy",
              epsilon=0.01):
     self.name = "Generic SemiGradient TD Control Class"
     self.nParams = nParams
     self.nActions = nActions
     self.alpha = alpha
     self.af_kwargs = approximationFunctionArgs
     self.af = getValueFromDict(self.af_kwargs, "af")
     self.afd = getValueFromDict(self.af_kwargs, "afd")
     self.w = np.zeros([self.nParams], dtype=float)
     self.policy = FunctionApproximationPolicy(
         self.nParams,
         self.nActions,
         self.af_kwargs,
         actionSelectionMethod=actionSelectionMethod,
         epsilon=epsilon)
Example #26
def tileCoding(state, action=None, **kwargs):
  '''
  Tile coding with grid tiles
  ''' 
  minStates = np.array(getValueFromDict(kwargs, "minStates"))
  maxStates = np.array(getValueFromDict(kwargs, "maxStates"))
  nTilings = getValueFromDict(kwargs, "nTilings")
  tilingOffsets = np.array(getValueFromDict(kwargs, "tilingOffsets"))
  tilingSize = np.array(getValueFromDict(kwargs, "tilingSize"))
  dimStates = len(minStates)
  if action is None:
    nActions = 1
    idx_action = 0
  else:
    nActions = getValueFromDict(kwargs, "nActions")
    idx_action = action
  stateVector = buildStateVector(state)
  stateFeatureVector = []
  for idx_tiling in range(nTilings):
    tileVector = np.zeros(tilingSize[idx_tiling])
    mappedIdx = mapValues(stateVector+tilingOffsets[idx_tiling], minStates, maxStates, np.zeros(dimStates), tilingSize[idx_tiling])
    mappedIdx = np.array(mappedIdx, dtype=int)
    if np.all(mappedIdx >= 0) and np.all(mappedIdx < tilingSize[idx_tiling]):
      tileVector[tuple(mappedIdx)] = 1
    stateFeatureVector.extend(tileVector.flatten())
  stateFeatureVectorSize = len(stateFeatureVector)
  featureVector = np.zeros(nActions*stateFeatureVectorSize)
  featureVector[idx_action*stateFeatureVectorSize:(idx_action+1)*stateFeatureVectorSize] = stateFeatureVector
  return np.array(featureVector, dtype=int).flatten()
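A 1-D usage sketch of tileCoding with two offset tilings of ten tiles each, assuming buildStateVector wraps the scalar state in an array and mapValues rescales linearly (all numbers are illustrative):
kwargs = {"minStates": [0.0], "maxStates": [1.0],
          "nTilings": 2,
          "tilingOffsets": [[0.0], [0.05]],
          "tilingSize": [[10], [10]]}
x = tileCoding(0.42, **kwargs)   # length-20 binary vector with one active tile per tiling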
Example #27
def selectAction_esoft_(actionValues, **kwargs):
    # TODO: consider implementing this
    epsilon = kwargs["epsilon"]
    argmax_function = getValueFromDict(kwargs, "argmaxfun", np.argmax)
    q_max = np.max(actionValues)
    n_greedy_actions = 0
    greedy_actions = []
    for i in range(len(actionValues)):
        if actionValues[i] == q_max:
            n_greedy_actions += 1
            greedy_actions.append(i)
    non_greedy_action_probability = epsilon / len(actionValues)
    greedy_action_probability = (
        (1.0 - epsilon) / n_greedy_actions) + non_greedy_action_probability
    p = np.zeros(len(actionValues)) + non_greedy_action_probability
    p[greedy_actions] = greedy_action_probability
    return np.random.choice(len(p), p=p)
Example #28
def FixedStateEncoding(state, action=None, **kwargs):
  stateEncodingMatrix = getValueFromDict(kwargs, 'stateEncodingMatrix')
  if action is None:
    return stateEncodingMatrix[state,:].T
  else:
    return stateEncodingMatrix[state,action,:].T
Example #29
 def computeExpectedValue(self, state, action, w, af_kwargs, gamma):
   # expected one-step return: sum_s' p(s'|s,a) * (r + gamma * v(s'; w))
   af = getValueFromDict(af_kwargs, "af")
   return np.sum([self.stateTransitionProbs[state, action, next_state] *
                  (self.defaultReward + gamma * af(w, next_state, **af_kwargs))
                  for next_state in range(self.nStates)])
Example #30
def selectAction_esoft(actionValues, **kwargs):
    argmax_function = getValueFromDict(kwargs, "argmaxfun", np.argmax)
    epsilon = kwargs["epsilon"]
    p = np.zeros_like(actionValues) + epsilon / (len(actionValues) - 1)
    p[argmax_function(actionValues)] = 1.0 - epsilon
    return np.random.choice(len(p), p=p)