# Policy-gradient agents (REINFORCE variants and actor-critic methods) built
# on a parametrized policy and value-function approximation.
# ParametrizedPolicy and getValueFromDict are assumed to be provided elsewhere
# in this project; the import path below is a guess and may need adjusting.
import numpy as np

from utils import ParametrizedPolicy, getValueFromDict
class REINFORCEwithBaseline:
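    """Monte-Carlo policy gradient with a learned state-value baseline
    (cf. Sutton & Barto, Ch. 13). The critic weights ``w`` parametrize an
    approximate state-value function used only to reduce the variance of
    the policy-gradient estimate.
    """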
    def __init__(self, alpha_w, alpha_theta, gamma, nParams_w,
                 approximationFunctionArgs, nParams_theta, nActions,
                 policyApproximationFunctionArgs):
        self.name = "REINFORCE with Baseline"
        self.alpha_w = alpha_w
        self.alpha_theta = alpha_theta
        self.gamma = gamma
        self.nParams_w = nParams_w
        self.af_kwargs = approximationFunctionArgs
        self.af = getValueFromDict(self.af_kwargs, "af")
        self.afd = getValueFromDict(self.af_kwargs, "afd")
        self.w = np.zeros([self.nParams_w], dtype=float)
        self.policy = ParametrizedPolicy(nParams_theta, nActions,
                                         policyApproximationFunctionArgs)
        self.bufferExperience = []

    def update(self, episode):
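        """Monte-Carlo update over a finished episode.

        For each step t this forms the return G_t, the baseline-corrected
        error delta = G_t - v(S_t, w), then takes the gradient steps
        w += alpha_w * delta * grad_w v and
        theta += alpha_theta * gamma^t * delta * grad_theta ln pi.
        """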
        self.updateBuffer(episode)
        T = len(self.bufferExperience) - 1
        if self.bufferExperience[T]["done"]:
            rewards = np.array(
                [self.bufferExperience[i]["reward"] for i in range(1, T + 1)])
            gammas = np.array([self.gamma**i for i in range(0, T)])
            for t in range(T):
                state = self.bufferExperience[t]["state"]
                action = self.bufferExperience[t]["action"]
                # G_t = sum_{k=t}^{T-1} gamma^(k-t) * R_{k+1}
                G = np.dot(rewards[t:], gammas[:T - t])
                # Baseline-corrected return (an advantage estimate, not a TD error)
                delta = G - self.getValue(state)
                dPolicy = self.policy.grad(state, action)
                self.w += self.alpha_w * delta * self.afd(
                    self.w, state, **self.af_kwargs)
                self.policy.theta += self.alpha_theta * self.gamma**t * delta * dPolicy
            self.bufferExperience = []

    def updateBuffer(self, episode):
        self.bufferExperience = episode

    def selectAction(self, state):
        return self.policy.sampleAction(state)

    def getValue(self, state):
        return self.af(self.w, state, **self.af_kwargs)

    def reset(self):
        self.bufferExperience = []
        self.w[:] = 0.0
        self.policy.reset()

    def getName(self):
        return self.name

    def getGreedyAction(self, state, availableActions=None):
        return self.selectAction(state)
class ActorCriticWithEligibilityTracesAvgReward:
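    """Continuing-task actor-critic with eligibility traces and an
    average-reward baseline (differential TD error), cf. Sutton & Barto,
    Ch. 13.
    """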
    def __init__(self, alpha_w, alpha_theta, alpha_r, lambd_w, lambd_theta,
                 nParams_w, approximationFunctionArgs, nParams_theta, nActions,
                 policyApproximationFunctionArgs):
        self.name = "Actor-Criticwith Eligibility Traces (average reward)"
        self.alpha_w = alpha_w
        self.alpha_theta = alpha_theta
        self.alpha_r = alpha_r
        self.lambd_w = lambd_w
        self.lambd_theta = lambd_theta
        self.nParams_w = nParams_w
        self.af_kwargs = approximationFunctionArgs
        self.af = getValueFromDict(self.af_kwargs, "af")
        self.afd = getValueFromDict(self.af_kwargs, "afd")
        self.nParams_theta = nParams_theta
        self.w = np.zeros([self.nParams_w], dtype=float)
        self.z_w = np.zeros([self.nParams_w], dtype=float)
        self.z_theta = np.zeros([self.nParams_theta], dtype=float)
        self.avgR = 0.0
        self.policy = ParametrizedPolicy(nParams_theta, nActions,
                                         policyApproximationFunctionArgs)

    def update(self, episode):
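        """One online step from the latest transition in `episode`.

        Uses the differential TD error
        delta = R - avgR + v(S', w) - v(S, w), updates the average-reward
        estimate, accumulates eligibility traces for critic and actor, and
        applies semi-gradient updates to w and theta.
        """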
        t = len(episode) - 2
        state = episode[t]["state"]
        action = episode[t]["action"]
        reward = episode[t + 1]["reward"]
        next_state = episode[t + 1]["state"]
        td_error = reward - self.avgR + self.getValue(
            next_state) - self.getValue(state)
        self.avgR += self.alpha_r * td_error
        dPolicy = self.policy.grad(state, action)
        self.z_w = self.lambd_w * self.z_w + self.afd(self.w, state,
                                                      **self.af_kwargs)
        self.z_theta = self.lambd_theta * self.z_theta + dPolicy
        self.w += self.alpha_w * td_error * self.z_w
        self.policy.theta += self.alpha_theta * td_error * self.z_theta

    def selectAction(self, state):
        return self.policy.sampleAction(state)

    def getValue(self, state):
        return self.af(self.w, state, **self.af_kwargs)

    def reset(self):
        self.w[:] = 0.0
        self.z_w[:] = 0.0
        self.z_theta[:] = 0.0
        self.avgR = 0.0
        self.policy.reset()

    def getName(self):
        return self.name

    def getGreedyAction(self, state, availableActions=None):
        return self.selectAction(state)
class OneStepActorCritic:
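    """Episodic one-step (TD(0)) actor-critic, cf. Sutton & Barto, Ch. 13."""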
    def __init__(self, alpha_w, alpha_theta, gamma, nParams_w,
                 approximationFunctionArgs, nParams_theta, nActions,
                 policyApproximationFunctionArgs):
        self.name = "One step Actor-Critic"
        self.alpha_w = alpha_w
        self.alpha_theta = alpha_theta
        self.gamma = gamma
        self.nParams_w = nParams_w
        self.af_kwargs = approximationFunctionArgs
        self.af = getValueFromDict(self.af_kwargs, "af")
        self.afd = getValueFromDict(self.af_kwargs, "afd")
        self.w = np.zeros([self.nParams_w], dtype=float)
        self.I = 1.0
        self.policy = ParametrizedPolicy(nParams_theta, nActions,
                                         policyApproximationFunctionArgs)

    def update(self, episode):
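        """One TD(0) step from the latest transition in `episode`.

        delta = R + gamma * v(S', w) - v(S, w), with v(S') = 0 at terminal
        states; the actor update is scaled by I = gamma^t, which is reset
        to 1 when the episode ends.
        """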
        t = len(episode) - 2
        state = episode[t]["state"]
        action = episode[t]["action"]
        reward = episode[t + 1]["reward"]
        next_state = episode[t + 1]["state"]
        done = episode[t + 1]["done"]
        if done:
            v_next = 0.0
        else:
            v_next = self.getValue(next_state)
        td_error = reward + self.gamma * v_next - self.getValue(state)
        dPolicy = self.policy.grad(state, action)
        self.w += self.alpha_w * td_error * self.afd(self.w, state,
                                                     **self.af_kwargs)
        self.policy.theta += self.alpha_theta * self.I * td_error * dPolicy
        if done:
            self.I = 1.0
        else:
            self.I *= self.gamma

    def selectAction(self, state):
        return self.policy.sampleAction(state)

    def getValue(self, state):
        return self.af(self.w, state, **self.af_kwargs)

    def reset(self):
        self.w[:] = 0.0
        self.I = 1.0
        self.policy.reset()

    def getName(self):
        return self.name

    def getGreedyAction(self, state, availableActions=None):
        return self.selectAction(state)
class REINFORCE:
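    """Plain Monte-Carlo policy gradient (REINFORCE), cf. Sutton & Barto,
    Ch. 13.
    """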
    def __init__(self, alpha, gamma, nParams, nActions,
                 policyApproximationFunctionArgs):
        self.name = "REINFORCE"
        self.alpha = alpha
        self.gamma = gamma
        self.policy = ParametrizedPolicy(nParams, nActions,
                                         policyApproximationFunctionArgs)
        self.bufferExperience = []

    def update(self, episode):
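        """Monte-Carlo update over a finished episode: for each step t,
        theta += alpha * gamma^t * G_t * grad_theta ln pi(A_t | S_t, theta).
        """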
        self.updateBuffer(episode)
        T = len(self.bufferExperience) - 1
        if self.bufferExperience[T]["done"]:
            rewards = np.array(
                [self.bufferExperience[i]["reward"] for i in range(1, T + 1)])
            gammas = np.array([self.gamma**i for i in range(0, T)])
            for t in range(T):
                state = self.bufferExperience[t]["state"]
                action = self.bufferExperience[t]["action"]
                G = np.dot(rewards[t:], gammas[:T - t])
                dPolicy = self.policy.grad(state, action)
                self.policy.theta += self.alpha * self.gamma**t * G * dPolicy
            self.bufferExperience = []

    def updateBuffer(self, episode):
        self.bufferExperience = episode

    def selectAction(self, state):
        return self.policy.sampleAction(state)

    def reset(self):
        self.bufferExperience = []
        self.policy.reset()

    def getName(self):
        return self.name

    def getGreedyAction(self, state, availableActions=None):
        return self.selectAction(state)
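

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original agents). The episode
# format below is inferred from the update() methods above: entry t carries
# the state/action at time t, entry t+1 carries the reward and done flag
# that followed. `env` is a hypothetical Gym-style environment whose step()
# is assumed to return (next_state, reward, done); adapt to your interface.
# ---------------------------------------------------------------------------
def runEpisodes(agent, env, nEpisodes=100):
    for _ in range(nEpisodes):
        state = env.reset()
        episode = [{"state": state, "action": None,
                    "reward": 0.0, "done": False}]
        done = False
        while not done:
            action = agent.selectAction(state)
            episode[-1]["action"] = action
            next_state, reward, done = env.step(action)
            episode.append({"state": next_state, "action": None,
                            "reward": reward, "done": done})
            # Actor-critic agents learn from the last transition each step;
            # REINFORCE-style agents wait until the final entry has done=True.
            agent.update(episode)
            state = next_state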