def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
    # polf=None means control (policy improvement); otherwise prediction
    # (evaluation of the given policy)
    control = polf is None
    this_polf = polf if polf is not None else self.get_init_policy_func()
    episodes = 0

    while episodes < self.num_episodes:
        start_state, start_action = self.mdp_rep.init_state_action_gen()
        mc_path = self.get_mc_path(
            this_polf,
            start_state,
            start_action
        )
        rew_arr = np.array([x for _, _, x, _ in mc_path])
        # Terminating episodes use exact returns; otherwise the returns are
        # truncated after nt_return_eval_steps steps
        if self.mdp_rep.terminal_state_func(mc_path[-1][0]):
            returns = get_returns_from_rewards_terminating(
                rew_arr,
                self.mdp_rep.gamma
            )
        else:
            returns = get_returns_from_rewards_non_terminating(
                rew_arr,
                self.mdp_rep.gamma,
                self.nt_return_eval_steps
            )
        sgd_pts = [((mc_path[i][0], mc_path[i][1]), r)
                   for i, r in enumerate(returns)
                   if not self.first_visit or mc_path[i][3]]
        # MC is an offline update, so the policy improves only after each episode
        self.qvf_fa.update_params(*zip(*sgd_pts))
        if control:
            this_polf = get_soft_policy_func_from_qf(
                self.qvf_fa.get_func_eval,
                self.state_action_func,
                self.softmax,
                self.epsilon_func(episodes)
            )
        episodes += 1

    return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act))
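# Hedged usage sketch (not part of the class above): the method returns a
# curried Q-value function of type state -> (action -> value). The states,
# actions and values below are hypothetical stand-ins, and a tiny dict-backed
# function replaces the learned approximation so the snippet runs on its own.
_fake_q = {('s1', 'a1'): 1.0, ('s1', 'a2'): 2.5}         # hypothetical learned Q-values
qf = lambda st: lambda act, st=st: _fake_q[(st, act)]    # same shape as the returned QFType
q_at_s1 = qf('s1')                                       # partial application on the state
greedy_action = max(['a1', 'a2'], key=q_at_s1)           # -> 'a2'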
def get_optimal_policy_func_pi(self) -> Callable[[S], A]:
    this_polf = self.get_init_policy_func()
    eps = self.tol * 1e4
    iters = 0
    params = deepcopy(self.fa.params)

    while eps >= self.tol:
        # Approximate policy evaluation (updates self.fa.params in place)
        self.get_value_func_fa(this_polf, True)
        qvf = self.get_act_value_func_fa(this_polf, False)

        def q_func(sa: Tuple[S, A], qvf=qvf) -> float:
            return qvf(sa[0])(sa[1])

        # Soft (epsilon-greedy / softmax) policy improvement from the Q-function
        this_polf = get_soft_policy_func_from_qf(
            qf=q_func,
            state_action_func=self.state_action_func,
            softmax=self.softmax,
            epsilon=self.epsilon_func(iters)
        )
        # Converged once the largest change in any FA parameter falls below tol
        new_params = deepcopy(self.fa.params)
        eps = ADP.get_gradient_max(
            [new_params[i] - p for i, p in enumerate(params)]
        )
        params = new_params
        iters += 1

    # noinspection PyShadowingNames
    def det_pol(s: S, this_polf=this_polf) -> A:
        return max(this_polf(s).items(), key=itemgetter(1))[0]

    return det_pol
def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
    ffs = self.qvf_fa.feature_funcs
    features = len(ffs)
    a_mat = np.zeros((features, features))
    b_vec = np.zeros(features)
    control = polf is None
    this_polf = polf if polf is not None else self.get_init_policy_func()

    for episode in range(self.num_episodes):
        if self.exploring_start:
            state, action = self.mdp_rep.init_state_action_gen()
        else:
            state = self.mdp_rep.init_state_gen()
            action = get_rv_gen_func_single(this_polf(state))()

        steps = 0
        terminate = False

        while not terminate:
            next_state, reward = \
                self.mdp_rep.state_reward_gen_func(state, action)
            phi_s = np.array([f((state, action)) for f in ffs])
            next_action = get_rv_gen_func_single(this_polf(next_state))()
            # Control (LSPI) bootstraps off the greedy action at the next
            # state; prediction (LSTDQ) bootstraps off the policy's action
            if control:
                next_act = max(
                    [(a, self.qvf_fa.get_func_eval((next_state, a)))
                     for a in self.state_action_func(next_state)],
                    key=itemgetter(1)
                )[0]
            else:
                next_act = next_action
            phi_sp = np.array([f((next_state, next_act)) for f in ffs])
            # Accumulate the LSTDQ statistics:
            # A += phi(s,a) (phi(s,a) - gamma * phi(s',a'))^T,  b += r * phi(s,a)
            a_mat += np.outer(phi_s, phi_s - self.mdp_rep.gamma * phi_sp)
            b_vec += reward * phi_s
            steps += 1
            terminate = steps >= self.max_steps or \
                self.mdp_rep.terminal_state_func(state)
            state = next_state
            action = next_action

        if control and (episode + 1) % self.batch_size == 0:
            # Solve A w = b for the linear weights, then improve the policy
            self.qvf_fa.params = [np.linalg.inv(a_mat).dot(b_vec)]
            this_polf = get_soft_policy_func_from_qf(
                self.qvf_fa.get_func_eval,
                self.state_action_func,
                self.softmax,
                self.epsilon_func(episode)
            )
            a_mat = np.zeros((features, features))
            b_vec = np.zeros(features)

    if not control:
        self.qvf_fa.params = [np.linalg.inv(a_mat).dot(b_vec)]

    return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act))
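# Hedged sketch (assumed, not from the class above) of the LSTDQ accumulation
# and solve on made-up numbers: the feature dimension and sampled transitions
# are hypothetical; only the updates A += phi(s,a) (phi(s,a) - gamma*phi(s',a'))^T,
# b += r * phi(s,a) and the final solve w = A^-1 b mirror the method.
import numpy as np

gamma = 0.9
a_mat = np.zeros((2, 2))
b_vec = np.zeros(2)
transitions = [                                  # (phi(s,a), reward, phi(s',a')) samples
    (np.array([1.0, 0.0]), 1.0, np.array([0.0, 1.0])),
    (np.array([0.0, 1.0]), 0.5, np.array([1.0, 0.0])),
]
for phi_s, r, phi_sp in transitions:
    a_mat += np.outer(phi_s, phi_s - gamma * phi_sp)
    b_vec += r * phi_s
w = np.linalg.solve(a_mat, b_vec)                # same as inv(a_mat).dot(b_vec), but more stable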
def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
    control = polf is None
    this_polf = polf if polf is not None else self.get_init_policy_func()
    episodes = 0

    while episodes < self.num_episodes:
        if self.exploring_start:
            state, action = self.mdp_rep.init_state_action_gen()
        else:
            state = self.mdp_rep.init_state_gen()
            action = get_rv_gen_func_single(this_polf(state))()

        steps = 0
        terminate = False

        while not terminate:
            next_state, reward = \
                self.mdp_rep.state_reward_gen_func(state, action)
            next_action = get_rv_gen_func_single(this_polf(next_state))()
            # Bootstrap value at the next state: greedy for Q-Learning,
            # expectation under the soft policy for Expected SARSA, and the
            # sampled next action for SARSA / prediction
            if self.algorithm == TDAlgorithm.QLearning and control:
                next_qv = max(self.qvf_fa.get_func_eval((next_state, a))
                              for a in self.state_action_func(next_state))
            elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                next_qv = get_expected_action_value(
                    {a: self.qvf_fa.get_func_eval((next_state, a))
                     for a in self.state_action_func(next_state)},
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            else:
                next_qv = self.qvf_fa.get_func_eval((next_state, next_action))
            target = reward + self.mdp_rep.gamma * next_qv
            # TD is an online update, so the policy improves at every time step
            self.qvf_fa.update_params([(state, action)], [target])
            if control:
                this_polf = get_soft_policy_func_from_qf(
                    self.qvf_fa.get_func_eval,
                    self.state_action_func,
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            steps += 1
            terminate = steps >= self.max_steps or \
                self.mdp_rep.terminal_state_func(state)
            state = next_state
            action = next_action

        episodes += 1

    return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act))
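# Hedged sketch (assumed, not from the class above) of the three bootstrap
# targets used in the control branches, on a hand-built Q-table: the states,
# actions, values and the epsilon-greedy weighting used here for Expected
# SARSA are made up; only the target formulas mirror the method.
q = {('s2', 'a1'): 1.0, ('s2', 'a2'): 3.0}       # hypothetical Q-values at the next state
reward, gamma, epsilon = 0.5, 0.9, 0.1
next_action = 'a1'                               # action actually sampled from the policy

sarsa_target = reward + gamma * q[('s2', next_action)]               # on-policy bootstrap
qlearning_target = reward + gamma * max(q.values())                  # greedy bootstrap
probs = {'a2': 1 - epsilon + epsilon / 2, 'a1': epsilon / 2}         # eps-greedy weights
expected_sarsa_target = reward + gamma * sum(
    probs[a] * q[('s2', a)] for a in probs)                          # expectation bootstrap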
def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
    control = polf is None
    this_polf = polf if polf is not None else self.get_init_policy_func()
    episodes = 0
    updates = 0

    while episodes < self.num_episodes:
        # Eligibility trace and feature vector are reset at each episode start
        et = np.zeros(self.qvf_fa.num_features)
        if self.exploring_start:
            state, action = self.mdp_rep.init_state_action_gen()
        else:
            state = self.mdp_rep.init_state_gen()
            action = get_rv_gen_func_single(this_polf(state))()
        features = self.qvf_fa.get_feature_vals((state, action))

        old_qvf_fa = 0.
        steps = 0
        terminate = False

        while not terminate:
            next_state, reward = \
                self.mdp_rep.state_reward_gen_func(state, action)
            next_action = get_rv_gen_func_single(this_polf(next_state))()
            next_features = self.qvf_fa.get_feature_vals(
                (next_state, next_action))
            qvf_fa = features.dot(self.qvf_w)
            if self.algorithm == TDAlgorithm.QLearning and control:
                next_qvf_fa = max(
                    self.qvf_fa.get_feature_vals((next_state, a)).dot(self.qvf_w)
                    for a in self.state_action_func(next_state))
            elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                next_qvf_fa = get_expected_action_value(
                    {a: self.qvf_fa.get_feature_vals(
                        (next_state, a)).dot(self.qvf_w)
                     for a in self.state_action_func(next_state)},
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            else:
                next_qvf_fa = next_features.dot(self.qvf_w)
            target = reward + self.mdp_rep.gamma * next_qvf_fa
            delta = target - qvf_fa
            alpha = self.qvf_fa.learning_rate * \
                (updates / self.learning_rate_decay + 1) ** -0.5
            # "True online" TD(lambda): dutch-trace update of the eligibility
            # vector, followed by the corresponding weight update
            et = et * self.gamma_lambda + features * \
                (1 - alpha * self.gamma_lambda * et.dot(features))
            self.qvf_w += alpha * (et * (delta + qvf_fa - old_qvf_fa) -
                                   features * (qvf_fa - old_qvf_fa))
            if control and self.batch_size == 0:
                this_polf = get_soft_policy_func_from_qf(
                    lambda sa: self.qvf_fa.get_feature_vals(sa).dot(self.qvf_w),
                    self.state_action_func,
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            updates += 1
            steps += 1
            terminate = steps >= self.max_steps or \
                self.mdp_rep.terminal_state_func(state)
            old_qvf_fa = next_qvf_fa
            state = next_state
            action = next_action
            features = next_features

        episodes += 1
        if control and self.batch_size != 0 and \
                episodes % self.batch_size == 0:
            this_polf = get_soft_policy_func_from_qf(
                lambda sa: self.qvf_fa.get_feature_vals(sa).dot(self.qvf_w),
                self.state_action_func,
                self.softmax,
                self.epsilon_func(episodes - 1)
            )

    return lambda st: lambda act, st=st: self.qvf_fa.get_feature_vals(
        (st, act)).dot(self.qvf_w)
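# Hedged sketch (assumed, not from the class above) isolating the dutch-trace
# ("true online" TD(lambda)) update on made-up numbers: the feature vector,
# weights, learning rate and TD inputs are hypothetical; the two update lines
# mirror the et and self.qvf_w updates in the method.
import numpy as np

gamma_lambda, alpha = 0.9 * 0.8, 0.05
w = np.array([0.2, -0.1])
et = np.zeros(2)
features = np.array([1.0, 0.5])                  # phi(s, a)
old_q = 0.0                                      # Q estimate carried over from the previous step
q = features.dot(w)
delta = 1.0 + 0.9 * 0.3 - q                      # r + gamma * Q(s', a') - Q(s, a), made-up r and Q(s', a')

et = et * gamma_lambda + features * (1 - alpha * gamma_lambda * et.dot(features))
w = w + alpha * (et * (delta + q - old_q) - features * (q - old_q))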
def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
    control = polf is None
    this_polf = polf if polf is not None else self.get_init_policy_func()
    episodes = 0

    while episodes < self.num_episodes:
        # One eligibility trace per FA parameter tensor, reset every episode
        et = [np.zeros_like(p) for p in self.qvf_fa.params]
        if self.exploring_start:
            state, action = self.mdp_rep.init_state_action_gen()
        else:
            state = self.mdp_rep.init_state_gen()
            action = get_rv_gen_func_single(this_polf(state))()

        steps = 0
        terminate = False
        states_actions = []
        targets = []

        while not terminate:
            next_state, reward = \
                self.mdp_rep.state_reward_gen_func(state, action)
            next_action = get_rv_gen_func_single(this_polf(next_state))()
            if self.algorithm == TDAlgorithm.QLearning and control:
                next_qv = max(self.qvf_fa.get_func_eval((next_state, a))
                              for a in self.state_action_func(next_state))
            elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                next_qv = get_expected_action_value(
                    {a: self.qvf_fa.get_func_eval((next_state, a))
                     for a in self.state_action_func(next_state)},
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            else:
                next_qv = self.qvf_fa.get_func_eval((next_state, next_action))
            target = reward + self.mdp_rep.gamma * next_qv
            delta = target - self.qvf_fa.get_func_eval((state, action))
            if self.offline:
                # Offline: collect the targets and apply one batched
                # lambda-return gradient step at the end of the episode
                states_actions.append((state, action))
                targets.append(target)
            else:
                # Online: decay the traces, add the prediction gradient at
                # (state, action), and step the parameters along et * delta
                et = [et[i] * self.gamma_lambda + g for i, g in
                      enumerate(self.qvf_fa.get_sum_objective_gradient(
                          [(state, action)],
                          np.ones(1)
                      ))]
                self.qvf_fa.update_params_from_gradient(
                    [-e * delta for e in et]
                )
            if control and self.batch_size == 0:
                this_polf = get_soft_policy_func_from_qf(
                    self.qvf_fa.get_func_eval,
                    self.state_action_func,
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            steps += 1
            terminate = steps >= self.max_steps or \
                self.mdp_rep.terminal_state_func(state)
            state = next_state
            action = next_action

        if self.offline:
            avg_grad = [g / len(states_actions) for g in
                        self.qvf_fa.get_el_tr_sum_loss_gradient(
                            states_actions,
                            targets,
                            self.gamma_lambda
                        )]
            self.qvf_fa.update_params_from_gradient(avg_grad)
        episodes += 1
        if control and self.batch_size != 0 and \
                episodes % self.batch_size == 0:
            this_polf = get_soft_policy_func_from_qf(
                self.qvf_fa.get_func_eval,
                self.state_action_func,
                self.softmax,
                self.epsilon_func(episodes - 1)
            )

    return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act))
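# Hedged sketch (assumed, not from the class above) of the online
# eligibility-trace step, with a single linear parameter vector standing in
# for self.qvf_fa.params: the numbers, the gradient and the learning rate are
# made up; the trace decay and the -et * delta update direction mirror the
# method, assuming update_params_from_gradient steps the parameters by
# -learning_rate * gradient.
import numpy as np

gamma_lambda, learning_rate, delta = 0.72, 0.05, 0.4
params = [np.array([0.1, 0.3])]
et = [np.zeros_like(p) for p in params]
grad = [np.array([1.0, 0.5])]                    # gradient of the prediction at (state, action)

et = [e * gamma_lambda + g for e, g in zip(et, grad)]
params = [p - learning_rate * (-e * delta) for p, e in zip(params, et)]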