def get_optimal_policy_func_vi(self) -> Callable[[S], A]:
    # Approximate value iteration: repeatedly fit the value function
    # approximation (self.fa) to backed-up values on sampled states until
    # the maximum parameter change falls below the tolerance, then return
    # the greedy (deterministic) policy extracted from the converged VFA.
    mo = self.mdp_rep
    samples_func = mo.sample_states_gen_func
    rew_func = mo.reward_func
    tr_func = mo.transitions_func
    eps = self.tol * 1e4
    iters = 0
    params = deepcopy(self.fa.params)
    while eps >= self.tol:
        samples = samples_func(self.num_samples)
        values = [get_expected_action_value(
            {a: rew_func(s, a) + mo.gamma *
                sum(p * self.fa.get_func_eval(s1)
                    for s1, p in tr_func(s, a).items())
             for a in self.state_action_func(s)},
            self.softmax,
            self.epsilon_func(iters)
        ) for s in samples]
        self.fa.update_params(samples, values)
        new_params = deepcopy(self.fa.params)
        eps = ADP.get_gradient_max(
            [new_params[i] - p for i, p in enumerate(params)]
        )
        params = new_params
        iters += 1

    # noinspection PyShadowingNames
    def deter_func(s: S, rew_func=rew_func, tr_func=tr_func) -> A:
        # Greedy policy from the converged VFA. Note the discounting of
        # successor-state values, consistent with the backup above.
        return max(
            [(a, rew_func(s, a) + mo.gamma *
                 sum(p * self.fa.get_func_eval(s1)
                     for s1, p in tr_func(s, a).items()))
             for a in self.state_action_func(s)],
            key=itemgetter(1)
        )[0]

    return deter_func
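# The helper get_expected_action_value used above (and in the TD methods
# below) is not shown in this excerpt. The sketch that follows is an
# assumption for illustration only, not the repo's implementation: given a
# dict of action values, return either a softmax-weighted expectation or an
# epsilon-greedy expectation (epsilon spread uniformly, the rest on the max).

from typing import Mapping, TypeVar
import numpy as np

A_ = TypeVar('A_')


def expected_action_value_sketch(
        action_values: Mapping[A_, float],
        softmax: bool,
        epsilon: float
) -> float:
    # Hypothetical stand-in for get_expected_action_value.
    vals = np.array(list(action_values.values()))
    if softmax:
        # softmax-weighted expectation of the action values
        exps = np.exp(vals - vals.max())
        return float((exps / exps.sum()).dot(vals))
    # epsilon-greedy expectation: uniform with prob. epsilon, greedy otherwise
    return float(epsilon * vals.mean() + (1. - epsilon) * vals.max())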
def get_qv_func_dict(self, pol: Optional[Policy]) -> QFDictType:
    # Tabular TD(lambda) with eligibility traces. If no policy is given,
    # this is control: the policy is improved (softmax or epsilon-greedy)
    # from the Q-values as they are learned.
    control = pol is None
    this_pol = pol if pol is not None else self.get_init_policy()
    sa_dict = self.mdp_rep.state_action_dict
    qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
    episodes = 0
    updates = 0

    while episodes < self.num_episodes:
        et_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
        state, action = self.mdp_rep.init_state_action_gen()
        steps = 0
        terminate = False

        while not terminate:
            next_state, reward = \
                self.mdp_rep.state_reward_gen_dict[state][action]()
            next_action = get_rv_gen_func_single(
                this_pol.get_state_probabilities(next_state)
            )()
            if self.algorithm == TDAlgorithm.QLearning and control:
                next_qv = max(qf_dict[next_state][a]
                              for a in qf_dict[next_state])
            elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                # expectation of Q(next_state, .) under the soft policy
                next_qv = get_expected_action_value(
                    qf_dict[next_state],
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            else:
                next_qv = qf_dict[next_state][next_action]

            delta = reward + self.mdp_rep.gamma * next_qv - \
                qf_dict[state][action]
            et_dict[state][action] += 1
            alpha = self.learning_rate * \
                (updates / self.learning_rate_decay + 1) ** -0.5
            # update every (s, a) pair in proportion to its trace, then decay
            for s, a_set in sa_dict.items():
                for a in a_set:
                    qf_dict[s][a] += alpha * delta * et_dict[s][a]
                    et_dict[s][a] *= self.gamma_lambda
            updates += 1

            if control:
                if self.softmax:
                    this_pol.edit_state_action_to_softmax(
                        state,
                        qf_dict[state]
                    )
                else:
                    this_pol.edit_state_action_to_epsilon_greedy(
                        state,
                        qf_dict[state],
                        self.epsilon_func(episodes)
                    )

            steps += 1
            terminate = steps >= self.max_steps or \
                state in self.mdp_rep.terminal_states
            state = next_state
            action = next_action

        episodes += 1

    return qf_dict
def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
    # TD(0) with a function approximation of Q(s, a). With no input policy
    # this is control: the soft policy is re-derived from the current
    # Q-value approximation after every step.
    control = polf is None
    this_polf = polf if polf is not None else self.get_init_policy_func()
    episodes = 0

    while episodes < self.num_episodes:
        if self.exploring_start:
            state, action = self.mdp_rep.init_state_action_gen()
        else:
            state = self.mdp_rep.init_state_gen()
            action = get_rv_gen_func_single(this_polf(state))()
        steps = 0
        terminate = False

        while not terminate:
            next_state, reward = \
                self.mdp_rep.state_reward_gen_func(state, action)
            next_action = get_rv_gen_func_single(this_polf(next_state))()
            if self.algorithm == TDAlgorithm.QLearning and control:
                next_qv = max(
                    self.qvf_fa.get_func_eval((next_state, a))
                    for a in self.state_action_func(next_state)
                )
            elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                # expectation of Q(next_state, .) under the soft policy
                next_qv = get_expected_action_value(
                    {a: self.qvf_fa.get_func_eval((next_state, a))
                     for a in self.state_action_func(next_state)},
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            else:
                next_qv = self.qvf_fa.get_func_eval(
                    (next_state, next_action)
                )

            target = reward + self.mdp_rep.gamma * next_qv
            # TD is an online update, so the policy improves at every time step
            self.qvf_fa.update_params([(state, action)], [target])
            if control:
                this_polf = get_soft_policy_func_from_qf(
                    self.qvf_fa.get_func_eval,
                    self.state_action_func,
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            steps += 1
            terminate = steps >= self.max_steps or \
                self.mdp_rep.terminal_state_func(state)
            state = next_state
            action = next_action

        episodes += 1

    return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval(
        (st, act))
def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
    # True online TD(lambda) for Q(s, a) with a linear function
    # approximation: the weights self.qvf_w act on the features produced by
    # self.qvf_fa and are updated with dutch-style eligibility traces.
    control = polf is None
    this_polf = polf if polf is not None else self.get_init_policy_func()
    episodes = 0
    updates = 0

    while episodes < self.num_episodes:
        et = np.zeros(self.qvf_fa.num_features)
        if self.exploring_start:
            state, action = self.mdp_rep.init_state_action_gen()
        else:
            state = self.mdp_rep.init_state_gen()
            action = get_rv_gen_func_single(this_polf(state))()
        features = self.qvf_fa.get_feature_vals((state, action))
        old_qvf_fa = 0.
        steps = 0
        terminate = False

        while not terminate:
            next_state, reward = \
                self.mdp_rep.state_reward_gen_func(state, action)
            next_action = get_rv_gen_func_single(this_polf(next_state))()
            next_features = self.qvf_fa.get_feature_vals(
                (next_state, next_action))
            qvf_fa = features.dot(self.qvf_w)
            if self.algorithm == TDAlgorithm.QLearning and control:
                next_qvf_fa = max(
                    self.qvf_fa.get_feature_vals(
                        (next_state, a)).dot(self.qvf_w)
                    for a in self.state_action_func(next_state)
                )
            elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                # expectation of Q(next_state, .) under the soft policy
                next_qvf_fa = get_expected_action_value(
                    {a: self.qvf_fa.get_feature_vals(
                        (next_state, a)).dot(self.qvf_w)
                     for a in self.state_action_func(next_state)},
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            else:
                next_qvf_fa = next_features.dot(self.qvf_w)

            target = reward + self.mdp_rep.gamma * next_qvf_fa
            delta = target - qvf_fa
            alpha = self.vf_fa.learning_rate * \
                (updates / self.learning_rate_decay + 1) ** -0.5
            # dutch trace and true-online weight update
            et = et * self.gamma_lambda + features * \
                (1 - alpha * self.gamma_lambda * et.dot(features))
            self.qvf_w += alpha * (et * (delta + qvf_fa - old_qvf_fa) -
                                   features * (qvf_fa - old_qvf_fa))
            if control and self.batch_size == 0:
                this_polf = get_soft_policy_func_from_qf(
                    lambda sa: self.qvf_fa.get_feature_vals(sa).dot(
                        self.qvf_w),
                    self.state_action_func,
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            updates += 1
            steps += 1
            terminate = steps >= self.max_steps or \
                self.mdp_rep.terminal_state_func(state)
            old_qvf_fa = next_qvf_fa
            state = next_state
            action = next_action
            features = next_features

        episodes += 1
        if control and self.batch_size != 0 and \
                episodes % self.batch_size == 0:
            # batch policy improvement must also evaluate Q through the
            # separately-maintained weights self.qvf_w (get_func_eval would
            # use the approximator's own, never-updated parameters)
            this_polf = get_soft_policy_func_from_qf(
                lambda sa: self.qvf_fa.get_feature_vals(sa).dot(self.qvf_w),
                self.state_action_func,
                self.softmax,
                self.epsilon_func(episodes - 1)
            )

    return lambda st: lambda act, st=st: self.qvf_fa.get_feature_vals(
        (st, act)).dot(self.qvf_w)
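# The trace and weight updates above follow the true online TD(lambda)
# recursion with a dutch trace. The toy snippet below (illustrative only,
# made-up numbers; gl stands for the combined gamma * lambda that the class
# stores as self.gamma_lambda) performs one such update in isolation,
# mirroring the two lines that update `et` and `self.qvf_w`.

import numpy as np

w = np.zeros(3)                       # linear weights
et = np.zeros(3)                      # eligibility trace
x = np.array([1.0, 0.5, 0.0])         # features of (state, action)
x_next = np.array([0.0, 1.0, 0.5])    # features of (next_state, next_action)
reward, gamma, lam, alpha, q_old = 1.0, 0.9, 0.8, 0.1, 0.0

q = x.dot(w)
q_next = x_next.dot(w)
delta = reward + gamma * q_next - q
gl = gamma * lam
et = et * gl + x * (1 - alpha * gl * et.dot(x))            # dutch trace
w += alpha * (et * (delta + q - q_old) - x * (q - q_old))  # true online step
q_old = q_next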
def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
    # TD(lambda) for Q(s, a) with a general (possibly nonlinear) function
    # approximation. Online mode keeps eligibility traces on the gradients
    # of the approximation; offline mode accumulates the episode's
    # (state, action) pairs and targets and does one batch gradient update.
    control = polf is None
    this_polf = polf if polf is not None else self.get_init_policy_func()
    episodes = 0

    while episodes < self.num_episodes:
        et = [np.zeros_like(p) for p in self.qvf_fa.params]
        if self.exploring_start:
            state, action = self.mdp_rep.init_state_action_gen()
        else:
            state = self.mdp_rep.init_state_gen()
            action = get_rv_gen_func_single(this_polf(state))()
        steps = 0
        terminate = False
        states_actions = []
        targets = []

        while not terminate:
            next_state, reward = \
                self.mdp_rep.state_reward_gen_func(state, action)
            next_action = get_rv_gen_func_single(this_polf(next_state))()
            if self.algorithm == TDAlgorithm.QLearning and control:
                next_qv = max(self.qvf_fa.get_func_eval((next_state, a))
                              for a in self.state_action_func(next_state))
            elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                # expectation of Q(next_state, .) under the soft policy
                next_qv = get_expected_action_value(
                    {a: self.qvf_fa.get_func_eval((next_state, a))
                     for a in self.state_action_func(next_state)},
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            else:
                next_qv = self.qvf_fa.get_func_eval(
                    (next_state, next_action))

            target = reward + self.mdp_rep.gamma * next_qv
            delta = target - self.qvf_fa.get_func_eval((state, action))
            if self.offline:
                states_actions.append((state, action))
                targets.append(target)
            else:
                # decay the traces, add the gradient at (state, action),
                # then step the parameters along delta times the traces
                et = [et[i] * self.gamma_lambda + g for i, g in
                      enumerate(self.qvf_fa.get_sum_objective_gradient(
                          [(state, action)],
                          np.ones(1)
                      ))]
                self.qvf_fa.update_params_from_gradient(
                    [-e * delta for e in et]
                )
            if control and self.batch_size == 0:
                this_polf = get_soft_policy_func_from_qf(
                    self.qvf_fa.get_func_eval,
                    self.state_action_func,
                    self.softmax,
                    self.epsilon_func(episodes)
                )
            steps += 1
            terminate = steps >= self.max_steps or \
                self.mdp_rep.terminal_state_func(state)
            state = next_state
            action = next_action

        if self.offline:
            # single batch update from the episode's accumulated targets
            avg_grad = [g / len(states_actions) for g in
                        self.qvf_fa.get_el_tr_sum_loss_gradient(
                            states_actions,
                            targets,
                            self.gamma_lambda
                        )]
            self.qvf_fa.update_params_from_gradient(avg_grad)
        episodes += 1
        if control and self.batch_size != 0 and \
                episodes % self.batch_size == 0:
            this_polf = get_soft_policy_func_from_qf(
                self.qvf_fa.get_func_eval,
                self.state_action_func,
                self.softmax,
                self.epsilon_func(episodes - 1)
            )

    return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act))
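# Each get_qv_func_fa above returns a curried Q-value function of type
# QFType: qf(state)(action) -> float (get_qv_func_dict instead returns a
# nested dict indexed as qf_dict[state][action]). The small helper below is
# a usage sketch, not part of the repo: it shows how a greedy action could
# be read off such a returned function once training is done, with the
# candidate actions supplied by state_action_func(state).

def greedy_action_from_qf(qf, actions, state):
    # qf: QFType closure returned by get_qv_func_fa
    # actions: iterable of actions available in `state`
    return max(actions, key=lambda a: qf(state)(a))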