        return np.dot(self.theta, self.sa2x(s, a))

    def grad(self, s, a):
        # Gradient of a linear model w.r.t. theta is just the feature vector
        return self.sa2x(s, a)


def getQs(model, s):
    # Evaluate Q(s, a) for every action under the current model
    Qs = {}
    for a in action_space:
        q_sa = model.predict(s, a)
        Qs[a] = q_sa
    return Qs


if __name__ == "__main__":
    g = standard_neg_grid()
    model = Model()

    # Repeat until convergence
    t = 1.0
    t2 = 1.0
    deltas = []
    for itr in range(20000):
        if itr % 100 == 0:
            t += 0.01
            t2 += 0.01
        if itr % 1000 == 0:
            print(itr)
        alpha = Alpha / t2  # decay the learning rate over time

        s = (2, 0)  # start each episode from the same state
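The excerpt stops right after resetting the start state. Below is a minimal sketch of how the episode loop could continue from here, assuming the grid exposes set_state/move/current_state/game_over methods and that gamma and Alpha are module-level constants; the helpers max_dict and random_action are hypothetical and are defined inline so the sketch is self-contained.

def max_dict(d):
    # Return the (key, value) pair with the largest value
    return max(d.items(), key=lambda kv: kv[1])


def random_action(a, eps=0.1):
    # Epsilon-greedy: keep a with probability 1 - eps, else act randomly
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice(action_space)


        # ... continuing inside the training loop, after s = (2, 0):
        g.set_state(s)
        a = random_action(max_dict(getQs(model, s))[0], eps=0.5 / t)
        biggest_change = 0
        while not g.game_over():
            r = g.move(a)
            s2 = g.current_state()
            old_theta = model.theta.copy()
            if g.game_over():
                # Terminal step: the TD target is just the reward
                target = r
                model.theta += alpha * (target - model.predict(s, a)) * model.grad(s, a)
            else:
                # Semi-gradient SARSA: bootstrap from Q(s2, a2)
                a2 = random_action(max_dict(getQs(model, s2))[0], eps=0.5 / t)
                target = r + gamma * model.predict(s2, a2)
                model.theta += alpha * (target - model.predict(s, a)) * model.grad(s, a)
                s, a = s2, a2
            biggest_change = max(biggest_change, np.abs(old_theta - model.theta).sum())
        deltas.append(biggest_change)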
    # Work backwards through the episode to accumulate returns:
    # G_t = r_{t+1} + gamma * G_{t+1}
    G = 0
    state_action_return = []
    first = True
    for s, a, r in reversed(state_action_reward):
        # Skip the final (s, a): the terminal state's return is 0 by definition
        if first:
            first = False
        else:
            state_action_return.append((s, a, G))
        G = r + gamma * G
    state_action_return.reverse()
    return state_action_return


if __name__ == "__main__":
    g = standard_neg_grid(step_cost=-0.1)

    # Initialize a random policy
    Policy = {}
    for s in g.actions.keys():
        Policy[s] = np.random.choice(action_space)

    # Initialize the action-value function Q and the returns dictionary
    Q = {}
    returns = {}
    states = g.all_states()
    for s in states:
        if s in g.actions:
            Q[s] = {}
            for a in action_space:
                Q[s][a] = 0
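The listing cuts off during initialization. A rough sketch of the rest of a first-visit Monte Carlo control loop follows, assuming the function whose tail appears above is named play_game(grid, policy) (hypothetical; the excerpt starts mid-function) and returns the (s, a, G) triples, and reusing the max_dict helper from the sketch above.

    # ... continuing inside __main__:
    for itr in range(5000):
        # Policy evaluation: sample one episode, average first-visit returns
        state_action_return = play_game(g, Policy)  # play_game is an assumed name
        seen = set()
        for s, a, G in state_action_return:
            if (s, a) not in seen:
                seen.add((s, a))
                returns.setdefault((s, a), []).append(G)
                Q[s][a] = np.mean(returns[(s, a)])
        # Policy improvement: act greedily with respect to Q
        for s in Policy.keys():
            Policy[s] = max_dict(Q[s])[0]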