        # Linear approximation: Q(s, a) = theta . x(s, a)
        return np.dot(self.theta, self.sa2x(s, a))

    def grad(self, s, a):
        # For a linear model, the gradient of Q(s, a) w.r.t. theta is just the feature vector.
        return self.sa2x(s, a)
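
# Sketch of how predict() and grad() are typically combined (assumed usage, not
# shown in this fragment): for a one-step target y, e.g.
#     y = r + gamma * max(getQs(model, s2).values())
# the semi-gradient update to the weights would be
#     model.theta += alpha * (y - model.predict(s, a)) * model.grad(s, a)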


def getQs(model, s):
    # Evaluate the approximate Q(s, a) for every action available from state s.
    Qs = {}
    for a in action_space:
        q_sa = model.predict(s, a)
        Qs[a] = q_sa
    return Qs
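

# Sketch (not part of the original script): one common way to consume the Q-values
# from getQs is an epsilon-greedy choice. The function name and the default eps
# below are assumptions for illustration.
def epsilon_greedy_from_Qs(Qs, eps=0.1):
    # With probability eps explore uniformly at random, otherwise act greedily.
    if np.random.random() < eps:
        return np.random.choice(action_space)
    return max(Qs, key=Qs.get)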


if __name__ == "__main__":
    g = standard_neg_grid()

    model = Model()
    # Repeat until convergence
    t = 1.0
    t2 = 1.0  # slowly increasing counters; t2 decays the learning rate below
    deltas = []
    for itr in range(20000):
        if itr % 100 == 0:
            t += 0.01
            t2 += 0.01
        if itr % 1000 == 0:
            print(itr)  # progress indicator

        alpha = Alpha / t2  # decaying learning rate
        s = (2, 0)  # reset to the start state for a new episode


def get_state_action_returns(state_action_reward):
    # Placeholder name: turn an episode's (state, action, reward) triples into
    # (state, action, return) triples by accumulating G = r + gamma * G backwards.
    G = 0
    state_action_return = []
    first = True
    for s, a, r in reversed(state_action_reward):
        if first:
            # Skip the terminal triple: no return follows the terminal state.
            first = False
        else:
            state_action_return.append((s, a, G))
        G = r + gamma * G
    # Put the returns back in chronological order.
    state_action_return.reverse()
    return state_action_return
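
# Worked example (illustrative numbers): with gamma = 0.9 and the episode
# [(s0, a0, 0), (s1, a1, 0), (s2, None, 1)], the reversed pass skips the
# terminal triple, credits (s1, a1) with G = 1, then (s0, a0) with
# G = 0 + 0.9 * 1 = 0.9, giving [(s0, a0, 0.9), (s1, a1, 1)] after reverse().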


if __name__ == "__main__":
    g = standard_neg_grid(step_cost=-0.1)

    # Initialize the policy with a random action in each state
    Policy = {}
    for s in g.actions.keys():
        Policy[s] = np.random.choice(action_space)
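
    # Sketch (assumed, not defined in this fragment): Monte Carlo control would later
    # improve this random policy greedily from Q, e.g. Policy[s] = max(Q[s], key=Q[s].get).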

    # Initialize the action-value function Q and the returns
    Q = {}
    returns = {}
    states = g.all_states()
    for s in states:
        if s in g.actions:
            Q[s] = {}
            for a in action_space:
                Q[s][a] = 0