Example #1
    def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):

        # compute basis functions
        phi_s = np.zeros((self.n))
        phi_ns = np.zeros((self.n))
        k = self.representation.features_num
        phi_s[:k] = self.representation.phi(s, False)
        phi_s[k:] = self.policy.dlogpi(s, a)
        phi_ns[:k] = self.representation.phi(ns, terminal)

        # update statistics
        self.z *= self.lambda_
        self.z += phi_s

        self.A += np.einsum("i,j",
                            self.z,
                            phi_s - self.discount_factor * phi_ns,
                            out=self.buf_)
        self.b += self.z * r
        if terminal:
            self.z[:] = 0.
        self.steps_between_updates += 1
        self.logger.debug("Statistics updated")

        if self.steps_between_updates > self.min_steps_between_updates:
            A = regularize(self.A)
            param, time = solveLinear(A, self.b)
            #  v = param[:k]  # parameters of the value function representation
            w = param[k:]  # natural gradient estimate

            if self._gradient_sane(
                    w
            ) or self.steps_between_updates > self.max_steps_between_updates:
                # update policy
                self.policy.theta = self.policy.theta + self.learn_rate * w
                self.last_w = w
                self.logger.debug("Policy updated, norm of gradient {}".format(
                    np.linalg.norm(w)))
                # forget statistics
                self.z *= 1. - self.forgetting_rate
                self.A *= 1. - self.forgetting_rate
                self.b *= 1. - self.forgetting_rate
                self.steps_between_updates = 0

        if terminal:
            self.episodeTerminated()
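
These learn() examples come from a natural actor-critic style agent: the feature vector stacks the value-function features phi(s) with the policy score dlogpi(s, a), an eligibility trace z accumulates them, and solving the accumulated system A * param = b yields both critic weights and a natural-gradient estimate for the policy update. Below is a minimal standalone sketch of that statistics loop, with hypothetical dimensions and random placeholder features standing in for the representation and policy:

import numpy as np

# Minimal sketch of the statistics loop above, with hypothetical sizes and
# random vectors standing in for representation.phi and policy.dlogpi.
k, m = 4, 3                  # value-feature / policy-score dimensions
n = k + m
lambda_, gamma = 0.9, 0.95

z = np.zeros(n)              # eligibility trace
A = np.zeros((n, n))
b = np.zeros(n)

rng = np.random.default_rng(0)
for _ in range(200):
    phi_s = np.concatenate([rng.normal(size=k),    # stands in for phi(s, False)
                            rng.normal(size=m)])   # stands in for dlogpi(s, a)
    phi_ns = np.concatenate([rng.normal(size=k), np.zeros(m)])
    r = rng.normal()

    z = lambda_ * z + phi_s                        # decayed trace, as in learn()
    A += np.outer(z, phi_s - gamma * phi_ns)
    b += z * r

param, *_ = np.linalg.lstsq(A, b, rcond=None)
v, w = param[:k], param[k:]  # critic weights / natural-gradient estimate

The agents above solve the same system through their solveLinear helper after regularize(A) and split param exactly as in the commented v = param[:k] line and the w = param[k:] step.
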
Example #2
    def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):

        # compute basis functions
        phi_s = np.zeros((self.n))
        phi_ns = np.zeros((self.n))
        k = self.representation.features_num
        phi_s[:k] = self.representation.phi(s, False)
        phi_s[k:] = self.policy.dlogpi(s, a)
        phi_ns[:k] = self.representation.phi(ns, terminal)

        # update statistics
        self.z *= self.lambda_
        self.z += phi_s

        self.A += np.einsum("i,j", self.z, phi_s - self.discount_factor * phi_ns,
                            out=self.buf_)
        self.b += self.z * r
        if terminal:
            self.z[:] = 0.
        self.steps_between_updates += 1
        self.logger.debug("Statistics updated")

        if self.steps_between_updates > self.min_steps_between_updates:
            A = regularize(self.A)
            param, time = solveLinear(A, self.b)
            #  v = param[:k]  # parameters of the value function representation
            w = param[k:]  # natural gradient estimate

            if self._gradient_sane(w) or self.steps_between_updates > self.max_steps_between_updates:
                # update policy
                self.policy.theta = self.policy.theta + self.learn_rate * w
                self.last_w = w
                self.logger.debug(
                    "Policy updated, norm of gradient {}".format(np.linalg.norm(w)))
                # forget statistics
                self.z *= 1. - self.forgetting_rate
                self.A *= 1. - self.forgetting_rate
                self.b *= 1. - self.forgetting_rate
                self.steps_between_updates = 0

        if terminal:
            self.episodeTerminated()
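
In both versions, np.einsum("i,j", self.z, ..., out=self.buf_) writes the outer product of the trace and the TD feature difference into a preallocated buffer before adding it to A; apart from reusing memory it matches np.outer. A quick check with made-up vectors:

import numpy as np

z = np.array([1.0, 2.0, 3.0])
d = np.array([0.5, -1.0])
buf = np.empty((3, 2))

# einsum("i,j", ...) forms the outer product z[i] * d[j]; out= reuses buf.
np.einsum("i,j", z, d, out=buf)
assert np.allclose(buf, np.outer(z, d))
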
Example #3
    def solveInMatrixFormat(self):
        # while delta_weight_vec > threshold
        #  1. Gather data following an e-greedy policy
        #  2. Calculate A and b estimates
        #  3. calculate new_weight_vec, and delta_weight_vec
        # return policy greedy w.r.t last weight_vec
        self.policy = eGreedy(
            self.representation,
            epsilon=self.epsilon)

        # Number of samples to be used for each policy evaluation phase. L1 in
        # the Geramifard et al. FTML 2012 paper
        self.samples_num = 1000

        self.start_time = clock()  # Used to track the total time for solving
        samples = 0
        converged = False
        iteration = 0
        while self.hasTime() and not converged:

            #  1. Gather samples following an e-greedy policy
            S, Actions, NS, R, T = self.policy.collectSamples(self.samples_num)
            samples += self.samples_num

            #  2. Calculate A and b estimates
            a_num = self.domain.actions_num
            n = self.representation.features_num
            discount_factor = self.domain.discount_factor

            self.A = np.zeros((n * a_num, n * a_num))
            self.b = np.zeros((n * a_num, 1))
            for i in xrange(self.samples_num):
                phi_s_a = self.representation.phi_sa(
                    S[i], Actions[i, 0]).reshape((-1, 1))
                E_phi_ns_na = self.calculate_expected_phi_ns_na(
                    S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
                d = phi_s_a - discount_factor * E_phi_ns_na
                self.A += np.outer(phi_s_a, d.T)
                self.b += phi_s_a * R[i, 0]

            #  3. calculate new_weight_vec, and delta_weight_vec
            new_weight_vec, solve_time = solveLinear(regularize(self.A), self.b)
            iteration += 1
            if solve_time > 1:
                self.logger.info(
                    '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)' %
                    (iteration, solve_time))
            delta_weight_vec = l_norm(new_weight_vec - self.representation.weight_vec, np.inf)
            converged = delta_weight_vec < self.convergence_threshold
            self.representation.weight_vec = new_weight_vec
            performance_return, performance_steps, performance_term, performance_discounted_return = self.performanceRun()
            self.logger.info(
                '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f' %
                (iteration, hhmmss(deltaT(self.start_time)), samples, delta_weight_vec, performance_return))
            if self.show:
                self.domain.show(S[-1], Actions[-1], self.representation)

            # store stats
            self.result["samples"].append(samples)
            self.result["return"].append(performance_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(self.representation.features_num)
            self.result["steps"].append(performance_steps)
            self.result["terminated"].append(performance_term)
            self.result["discounted_return"].append(performance_discounted_return)
            self.result["iteration"].append(iteration)

        if converged:
            self.logger.info('Converged!')

        super(TrajectoryBasedPolicyIteration, self).solve()
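
The policy-evaluation step above builds a standard LSTD-style system in state-action feature space: each sample contributes a rank-1 update A += phi(s, a) * (phi(s, a) - gamma * E[phi(s', a')])^T and b += phi(s, a) * r before the system is solved for new weights. A toy accumulation pass, with hypothetical random vectors standing in for phi_sa and calculate_expected_phi_ns_na:

import numpy as np

# Toy LSTD-Q accumulation with hypothetical data; phi_s_a and E_phi_ns_na
# stand in for representation.phi_sa and calculate_expected_phi_ns_na.
n, a_num = 5, 2
dim = n * a_num
gamma = 0.9

A = np.zeros((dim, dim))
b = np.zeros((dim, 1))

rng = np.random.default_rng(1)
for _ in range(50):
    phi_s_a = rng.normal(size=(dim, 1))
    E_phi_ns_na = rng.normal(size=(dim, 1))
    r = rng.normal()

    d = phi_s_a - gamma * E_phi_ns_na
    A += np.outer(phi_s_a, d)        # same rank-1 update as in the loop above
    b += phi_s_a * r

new_weight_vec, *_ = np.linalg.lstsq(A, b, rcond=None)
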
Example #4
    def solveInMatrixFormat(self):
        # while delta_weight_vec > threshold
        #  1. Gather data following an e-greedy policy
        #  2. Calculate A and b estimates
        #  3. calculate new_weight_vec, and delta_weight_vec
        # return policy greedy w.r.t last weight_vec
        self.policy = eGreedy(self.representation, epsilon=self.epsilon)

        # Number of samples to be used for each policy evaluation phase. L1 in
        # the Geramifard et al. FTML 2012 paper
        self.samples_num = 1000

        self.start_time = clock()  # Used to track the total time for solving
        samples = 0
        converged = False
        iteration = 0
        while self.hasTime() and not converged:

            #  1. Gather samples following an e-greedy policy
            S, Actions, NS, R, T = self.collectSamples(self.samples_num)
            samples += self.samples_num

            #  2. Calculate A and b estimates
            a_num = self.domain.actions_num
            n = self.representation.features_num
            discount_factor = self.domain.discount_factor

            self.A = np.zeros((n * a_num, n * a_num))
            self.b = np.zeros((n * a_num, 1))
            for i in xrange(self.samples_num):
                phi_s_a = self.representation.phi_sa(S[i], T[i],
                                                     Actions[i, 0]).reshape(
                                                         (-1, 1))
                E_phi_ns_na = self.calculate_expected_phi_ns_na(
                    S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
                d = phi_s_a - discount_factor * E_phi_ns_na
                self.A += np.outer(phi_s_a, d.T)
                self.b += phi_s_a * R[i, 0]

            #  3. calculate new_weight_vec, and delta_weight_vec
            new_weight_vec, solve_time = solveLinear(regularize(self.A),
                                                     self.b)
            iteration += 1
            if solve_time > 1:
                self.logger.info(
                    '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)' %
                    (iteration, solve_time))
            delta_weight_vec = l_norm(
                new_weight_vec - self.representation.weight_vec, np.inf)
            converged = delta_weight_vec < self.convergence_threshold
            self.representation.weight_vec = new_weight_vec
            performance_return, performance_steps, performance_term, performance_discounted_return = self.performanceRun(
            )
            self.logger.info(
                '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f'
                % (iteration, hhmmss(deltaT(self.start_time)), samples,
                   delta_weight_vec, performance_return))
            if self.show:
                self.domain.show(S[-1], Actions[-1], self.representation)

            # store stats
            self.result["samples"].append(samples)
            self.result["return"].append(performance_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(
                self.representation.features_num)
            self.result["steps"].append(performance_steps)
            self.result["terminated"].append(performance_term)
            self.result["discounted_return"].append(
                performance_discounted_return)
            self.result["iteration"].append(iteration)

        if converged:
            self.logger.info('Converged!')

        super(TrajectoryBasedPolicyIteration, self).solve()
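
The convergence test compares successive weight vectors in the infinity norm; assuming the l_norm helper behaves like np.linalg.norm(x, np.inf), the loop stops once the largest per-component weight change falls below convergence_threshold:

import numpy as np

# Hedged sketch of the convergence test; l_norm is assumed to be an
# infinity-norm helper equivalent to np.linalg.norm(..., np.inf).
convergence_threshold = 1e-3

old_w = np.array([0.10, -0.20, 0.05])
new_w = np.array([0.1004, -0.2001, 0.0498])

delta = np.linalg.norm(new_w - old_w, np.inf)   # largest absolute change
converged = delta < convergence_threshold
print(delta, converged)                         # roughly 0.0004, True
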