Example #1
    def get_optimal_policy_func_vi(self) -> Callable[[S], A]:
        mo = self.mdp_rep
        samples_func = mo.sample_states_gen_func
        rew_func = mo.reward_func
        tr_func = mo.transitions_func
        eps = self.tol * 1e4
        iters = 0
        params = deepcopy(self.fa.params)
        while eps >= self.tol:
            samples = samples_func(self.num_samples)
            values = [
                get_expected_action_value(
                    {
                        a: rew_func(s, a) +
                        mo.gamma * sum(p * self.fa.get_func_eval(s1)
                                       for s1, p in tr_func(s, a).items())
                        for a in self.state_action_func(s)
                    }, self.softmax, self.epsilon_func(iters)) for s in samples
            ]
            self.fa.update_params(samples, values)
            new_params = deepcopy(self.fa.params)
            eps = ADP.get_gradient_max(
                [new_params[i] - p for i, p in enumerate(params)])
            params = new_params
            iters += 1

        # noinspection PyShadowingNames
        def deter_func(s: S, rew_func=rew_func, tr_func=tr_func) -> A:
            # Greedy action w.r.t. the fitted value function, using the same
            # discounted one-step lookahead as the iteration loop above.
            return max(
                [(a, rew_func(s, a) + mo.gamma *
                  sum(p * self.fa.get_func_eval(s1)
                      for s1, p in tr_func(s, a).items()))
                 for a in self.state_action_func(s)],
                key=itemgetter(1))[0]

        return deter_func
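The helper get_expected_action_value is called above but not shown. A minimal, self-contained sketch of what such a helper might do, assuming (from the call sites) that it takes a dict of action values, a softmax flag and an epsilon, and returns the expected value under the corresponding softmax or epsilon-greedy policy:

import numpy as np
from typing import Mapping, TypeVar

A = TypeVar('A')


def expected_action_value_sketch(
    action_values: Mapping[A, float],
    softmax: bool,
    epsilon: float
) -> float:
    # Hypothetical stand-in for get_expected_action_value: the expectation of
    # the given action values under a softmax (Boltzmann) policy or an
    # epsilon-greedy policy.
    vals = np.array(list(action_values.values()))
    if softmax:
        # Softmax weights, shifted by the max value for numerical stability.
        weights = np.exp(vals - vals.max())
        probs = weights / weights.sum()
    else:
        # Epsilon-greedy: uniform epsilon mass plus (1 - epsilon) on the argmax.
        probs = np.full(len(vals), epsilon / len(vals))
        probs[vals.argmax()] += 1.0 - epsilon
    return float(probs.dot(vals))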
Example #2
    def get_qv_func_dict(self, pol: Optional[Policy]) -> QFDictType:
        control = pol is None
        this_pol = pol if pol is not None else self.get_init_policy()
        sa_dict = self.mdp_rep.state_action_dict
        qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
        episodes = 0
        updates = 0

        while episodes < self.num_episodes:
            et_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
            state, action = self.mdp_rep.init_state_action_gen()
            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_dict[state][action]()
                next_action = get_rv_gen_func_single(
                    this_pol.get_state_probabilities(next_state))()
                if self.algorithm == TDAlgorithm.QLearning and control:
                    next_qv = max(qf_dict[next_state][a]
                                  for a in qf_dict[next_state])
                elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                    # next_qv = sum(this_pol.get_state_action_probability(
                    #     next_state,
                    #     a
                    # ) * qf_dict[next_state][a] for a in qf_dict[next_state])
                    next_qv = get_expected_action_value(
                        qf_dict[next_state], self.softmax,
                        self.epsilon_func(episodes))
                else:
                    next_qv = qf_dict[next_state][next_action]

                delta = reward + self.mdp_rep.gamma * next_qv -\
                    qf_dict[state][action]
                et_dict[state][action] += 1
                alpha = self.learning_rate * (
                    updates / self.learning_rate_decay + 1)**-0.5
                for s, a_set in sa_dict.items():
                    for a in a_set:
                        qf_dict[s][a] += alpha * delta * et_dict[s][a]
                        et_dict[s][a] *= self.gamma_lambda
                updates += 1
                if control:
                    if self.softmax:
                        this_pol.edit_state_action_to_softmax(
                            state, qf_dict[state])
                    else:
                        this_pol.edit_state_action_to_epsilon_greedy(
                            state, qf_dict[state], self.epsilon_func(episodes))
                steps += 1
                terminate = steps >= self.max_steps or \
                    state in self.mdp_rep.terminal_states
                state = next_state
                action = next_action

            episodes += 1

        return qf_dict
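The nested loop over sa_dict above is the accumulating eligibility-trace update of TD(lambda). A self-contained toy illustration of that update on a two-state, two-action problem (all names and numbers here are made up for demonstration and are not part of the class above):

import random

gamma, lam, alpha = 0.9, 0.8, 0.1
q = {s: {a: 0.0 for a in ('L', 'R')} for s in ('s0', 's1')}
et = {s: {a: 0.0 for a in ('L', 'R')} for s in ('s0', 's1')}


def td_lambda_step(state, action, reward, next_state, next_action):
    # SARSA(lambda)-style update: bump the trace of the visited pair, then
    # move every Q-value along its trace-weighted share of the TD error.
    delta = reward + gamma * q[next_state][next_action] - q[state][action]
    et[state][action] += 1.0
    for s in q:
        for a in q[s]:
            q[s][a] += alpha * delta * et[s][a]
            et[s][a] *= gamma * lam


td_lambda_step('s0', 'R', 1.0, 's1', random.choice(('L', 'R')))
print(q)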
Example #3
    def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
        control = polf is None
        this_polf = polf if polf is not None else self.get_init_policy_func()
        episodes = 0

        while episodes < self.num_episodes:
            if self.exploring_start:
                state, action = self.mdp_rep.init_state_action_gen()
            else:
                state = self.mdp_rep.init_state_gen()
                action = get_rv_gen_func_single(this_polf(state))()

            # print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in
            #        self.mdp_rep.state_action_func(state))))
            # print(self.qvf_fa.params)

            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_func(state, action)
                next_action = get_rv_gen_func_single(this_polf(next_state))()
                if self.algorithm == TDAlgorithm.QLearning and control:
                    next_qv = max(
                        self.qvf_fa.get_func_eval((next_state, a))
                        for a in self.state_action_func(next_state))
                elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                    # next_qv = sum(this_polf(next_state).get(a, 0.) *
                    #               self.qvf_fa.get_func_eval((next_state, a))
                    #               for a in self.state_action_func(next_state))
                    next_qv = get_expected_action_value(
                        {
                            a: self.qvf_fa.get_func_eval((next_state, a))
                            for a in self.state_action_func(next_state)
                        }, self.softmax, self.epsilon_func(episodes))
                else:
                    next_qv = self.qvf_fa.get_func_eval(
                        (next_state, next_action))

                target = reward + self.mdp_rep.gamma * next_qv
                # TD is an online update, so the policy improves at every time step
                self.qvf_fa.update_params([(state, action)], [target])
                if control:
                    this_polf = get_soft_policy_func_from_qf(
                        self.qvf_fa.get_func_eval, self.state_action_func,
                        self.softmax, self.epsilon_func(episodes))
                steps += 1
                terminate = steps >= self.max_steps or \
                    self.mdp_rep.terminal_state_func(state)
                state = next_state
                action = next_action

            episodes += 1

        return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval(
            (st, act))
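The returned Q-value function is curried: calling it with a state yields a function over actions, and the st=st default argument pins the state inside the inner lambda. A small self-contained illustration of that return shape, backed by a plain dict instead of the function approximator self.qvf_fa:

from typing import Callable, Dict, Tuple


def make_qf(table: Dict[Tuple[str, str], float]
            ) -> Callable[[str], Callable[[str], float]]:
    # Same curried shape as the return statement above; unseen pairs get 0.0.
    return lambda st: lambda act, st=st: table.get((st, act), 0.0)


toy_q = {('s0', 'a0'): 1.5, ('s0', 'a1'): 0.7}
qf = make_qf(toy_q)
assert qf('s0')('a0') == 1.5
best_action = max(('a0', 'a1'), key=qf('s0'))  # greedy action at 's0' -> 'a0'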
Example #4
    def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
        control = polf is None
        this_polf = polf if polf is not None else self.get_init_policy_func()
        episodes = 0
        updates = 0

        while episodes < self.num_episodes:
            et = np.zeros(self.qvf_fa.num_features)
            if self.exploring_start:
                state, action = self.mdp_rep.init_state_action_gen()
            else:
                state = self.mdp_rep.init_state_gen()
                action = get_rv_gen_func_single(this_polf(state))()
            features = self.qvf_fa.get_feature_vals((state, action))

            # print((episodes, max(self.qvf_fa.get_feature_vals((state, a)).dot(self.qvf_w)
            #                      for a in self.mdp_rep.state_action_func(state))))
            # print(self.qvf_w)

            old_qvf_fa = 0.
            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_func(state, action)
                next_action = get_rv_gen_func_single(this_polf(next_state))()
                next_features = self.qvf_fa.get_feature_vals(
                    (next_state, next_action))
                qvf_fa = features.dot(self.qvf_w)
                if self.algorithm == TDAlgorithm.QLearning and control:
                    next_qvf_fa = max(
                        self.qvf_fa.get_feature_vals((next_state,
                                                      a)).dot(self.qvf_w)
                        for a in self.state_action_func(next_state))
                elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                    # next_qvf_fa = sum(this_polf(next_state).get(a, 0.) *
                    #               self.qvf_fa.get_feature_vals((next_state, a)).dot(self.qvf_w)
                    #               for a in self.state_action_func(next_state))
                    next_qvf_fa = get_expected_action_value(
                        {
                            a: self.qvf_fa.get_feature_vals(
                                (next_state, a)).dot(self.qvf_w)
                            for a in self.state_action_func(next_state)
                        }, self.softmax, self.epsilon_func(episodes))
                else:
                    next_qvf_fa = next_features.dot(self.qvf_w)

                target = reward + self.mdp_rep.gamma * next_qvf_fa
                delta = target - qvf_fa
                alpha = self.vf_fa.learning_rate * \
                    (updates / self.learning_rate_decay + 1) ** -0.5
                et = et * self.gamma_lambda + features * \
                    (1 - alpha * self.gamma_lambda * et.dot(features))
                self.qvf_w += alpha * (et * (delta + qvf_fa - old_qvf_fa) -
                                       features * (qvf_fa - old_qvf_fa))

                if control and self.batch_size == 0:
                    this_polf = get_soft_policy_func_from_qf(
                        lambda sa: self.qvf_fa.get_feature_vals(sa).dot(
                            self.qvf_w), self.state_action_func, self.softmax,
                        self.epsilon_func(episodes))
                updates += 1
                steps += 1
                terminate = steps >= self.max_steps or \
                    self.mdp_rep.terminal_state_func(state)
                old_qvf_fa = next_qvf_fa
                state = next_state
                action = next_action
                features = next_features

            episodes += 1

            if control and self.batch_size != 0 and\
                    episodes % self.batch_size == 0:
                # Evaluate Q through the learned weights qvf_w, consistent with
                # the online policy improvement and the returned function below.
                this_polf = get_soft_policy_func_from_qf(
                    lambda sa: self.qvf_fa.get_feature_vals(sa).dot(
                        self.qvf_w), self.state_action_func, self.softmax,
                    self.epsilon_func(episodes - 1))

        return lambda st: lambda act, st=st: self.qvf_fa.get_feature_vals(
            (st, act)).dot(self.qvf_w)
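The et and qvf_w updates above are the "dutch trace" form used by true online TD(lambda)/SARSA(lambda) with linear function approximation. A self-contained sketch of that same update on made-up feature vectors (gamma, lam, alpha and the features are illustrative values, not taken from the class):

import numpy as np

gamma, lam, alpha = 0.9, 0.8, 0.05
w = np.zeros(3)       # linear weights, playing the role of self.qvf_w
et = np.zeros(3)      # dutch eligibility trace
q_old = 0.0


def true_online_step(phi, reward, phi_next):
    global w, et, q_old
    q, q_next = phi.dot(w), phi_next.dot(w)
    delta = reward + gamma * q_next - q
    # Dutch trace: decays like an accumulating trace, but the correction term
    # keeps repeated visits to the same features from being over-counted.
    et = et * gamma * lam + phi * (1.0 - alpha * gamma * lam * et.dot(phi))
    w = w + alpha * (et * (delta + q - q_old) - phi * (q - q_old))
    q_old = q_next


true_online_step(np.array([1.0, 0.0, 0.5]), 1.0, np.array([0.0, 1.0, 0.5]))
print(w)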
Example #5
    def get_qv_func_fa(self, polf: Optional[PolicyActDictType]) -> QFType:
        control = polf is None
        this_polf = polf if polf is not None else self.get_init_policy_func()
        episodes = 0

        while episodes < self.num_episodes:
            et = [np.zeros_like(p) for p in self.qvf_fa.params]
            if self.exploring_start:
                state, action = self.mdp_rep.init_state_action_gen()
            else:
                state = self.mdp_rep.init_state_gen()
                action = get_rv_gen_func_single(this_polf(state))()

            # print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in
            #        self.mdp_rep.state_action_func(state))))
            # print(self.qvf_fa.params)

            steps = 0
            terminate = False

            states_actions = []
            targets = []
            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_func(state, action)
                next_action = get_rv_gen_func_single(this_polf(next_state))()
                if self.algorithm == TDAlgorithm.QLearning and control:
                    next_qv = max(self.qvf_fa.get_func_eval((next_state, a)) for a in
                                  self.state_action_func(next_state))
                elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                    # next_qv = sum(this_polf(next_state).get(a, 0.) *
                    #               self.qvf_fa.get_func_eval((next_state, a))
                    #               for a in self.state_action_func(next_state))
                    next_qv = get_expected_action_value(
                        {a: self.qvf_fa.get_func_eval((next_state, a)) for a in
                         self.state_action_func(next_state)},
                        self.softmax,
                        self.epsilon_func(episodes)
                    )
                else:
                    next_qv = self.qvf_fa.get_func_eval((next_state, next_action))

                target = reward + self.mdp_rep.gamma * next_qv
                delta = target - self.qvf_fa.get_func_eval((state, action))

                if self.offline:
                    states_actions.append((state, action))
                    targets.append(target)
                else:
                    et = [et[i] * self.gamma_lambda + g for i, g in
                          enumerate(self.qvf_fa.get_sum_objective_gradient(
                              [(state, action)],
                              np.ones(1)))]
                    self.qvf_fa.update_params_from_gradient(
                        [-e * delta for e in et]
                    )
                if control and self.batch_size == 0:
                    this_polf = get_soft_policy_func_from_qf(
                        self.qvf_fa.get_func_eval,
                        self.state_action_func,
                        self.softmax,
                        self.epsilon_func(episodes)
                    )
                steps += 1
                terminate = steps >= self.max_steps or \
                    self.mdp_rep.terminal_state_func(state)

                state = next_state
                action = next_action

            if self.offline:
                avg_grad = [g / len(states_actions) for g in
                            self.qvf_fa.get_el_tr_sum_loss_gradient(
                                states_actions,
                                targets,
                                self.gamma_lambda
                            )]
                self.qvf_fa.update_params_from_gradient(avg_grad)

            episodes += 1

            if control and self.batch_size != 0 and\
                    episodes % self.batch_size == 0:
                this_polf = get_soft_policy_func_from_qf(
                    self.qvf_fa.get_func_eval,
                    self.state_action_func,
                    self.softmax,
                    self.epsilon_func(episodes - 1)
                )

        return lambda st: lambda act, st=st: self.qvf_fa.get_func_eval((st, act))
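In the online (non-offline) branch above, the eligibility trace lives in parameter space: a list of arrays shaped like self.qvf_fa.params, decayed by gamma*lambda and incremented by the gradient of the Q-value estimate. A minimal sketch of that semi-gradient TD(lambda) update for a single linear parameter vector, where the gradient of Q with respect to the weights is just the feature vector (all names and numbers are illustrative):

import numpy as np

gamma, lam, alpha = 0.9, 0.8, 0.05
params = [np.zeros(3)]                   # one parameter array, as in a list of layers
et = [np.zeros_like(p) for p in params]  # one trace array per parameter array


def q_value(phi):
    return phi.dot(params[0])


def online_td_lambda_step(phi, reward, phi_next):
    # For a linear Q, the role of get_sum_objective_gradient is played by the
    # feature vector phi; the trace accumulates decayed gradients.
    delta = reward + gamma * q_value(phi_next) - q_value(phi)
    et[0] = et[0] * gamma * lam + phi
    params[0] = params[0] + alpha * delta * et[0]


online_td_lambda_step(np.array([1.0, 0.0, 0.5]), 1.0, np.array([0.0, 1.0, 0.5]))
print(params[0])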