Example #1
0
    def LSTD(self):
        """Solve the LSTD linear system and store the resulting weights.

        Unless ``self.fixedRep`` is set, the per-sample feature matrices
        (``all_phi_s``, ``all_phi_ns``, ``all_phi_s_a``, ``all_phi_ns_na``)
        and the ``A`` / ``b`` terms are rebuilt from the stored samples
        first; otherwise the existing ``self.A`` and ``self.b`` are reused.
        The solution is written to ``self.representation.weight_vec`` and
        the elapsed time is logged.
        """
        tic = Tools.clock()

        if not self.fixedRep:
            count = self.samples_count

            # Feature vectors of every sampled state and next state, one
            # row per sample.
            shape = (count, self.representation.features_num)
            self.all_phi_s = np.empty(
                shape, dtype=self.representation.featureType())
            self.all_phi_ns = np.empty(
                shape, dtype=self.representation.featureType())
            for row in range(count):
                self.all_phi_s[row, :] = self.representation.phi(
                    self.data_s[row])
                self.all_phi_ns[row, :] = self.representation.phi(
                    self.data_ns[row])

            # Expand the state features into state-action features using
            # the recorded actions and next actions.
            self.all_phi_s_a = self.representation.batchPhi_s_a(
                self.all_phi_s[:self.samples_count, :],
                self.data_a[:self.samples_count, :],
                use_sparse=self.use_sparse)
            self.all_phi_ns_na = self.representation.batchPhi_s_a(
                self.all_phi_ns[:self.samples_count, :],
                self.data_na[:self.samples_count, :],
                use_sparse=self.use_sparse)

            # Assemble b = F1' R and A = F1' (F1 - gamma * F2).
            cur_feats = self.all_phi_s_a[:self.samples_count, :]
            next_feats = self.all_phi_ns_na[:self.samples_count, :]
            rewards = self.data_r[:self.samples_count, :]
            gamma = self.discount_factor

            cur_feats_t = cur_feats.T
            td_feats = cur_feats - gamma * next_feats
            if self.use_sparse:
                # The sparse branch multiplies with ``*``; the dense branch
                # uses np.dot.
                self.b = (cur_feats_t * rewards).reshape(-1, 1)
                self.A = cur_feats_t * td_feats
            else:
                self.b = np.dot(cur_feats_t, rewards).reshape(-1, 1)
                self.A = np.dot(cur_feats_t, td_feats)

        regularized_A = Tools.regularize(self.A)

        # Solve for the new weight vector.
        solution = Tools.solveLinear(regularized_A, self.b)
        self.representation.weight_vec, solve_time = solution

        # The solve time is only reported when it exceeds one second.
        if solve_time > 1:
            message = 'Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)' % (
                Tools.deltaT(tic), solve_time)
        else:
            message = 'Total LSTD Time = %0.0f(s)' % (Tools.deltaT(tic))
        self.logger.info(message)
Example #2
0
    def LSTD(self):
        """Run LSTD on the collected samples and update the weight vector.

        When ``self.fixedRep`` is false, the feature matrices and the
        ``A`` / ``b`` terms are recomputed from the stored samples before
        solving; when it is true, the existing ``self.A`` and ``self.b``
        are used unchanged. The solved weights are stored in
        ``self.representation.weight_vec`` and timing is logged.
        """
        started_at = Tools.clock()

        if not self.fixedRep:
            # One feature row per sample, for states and next states.
            num_samples = self.samples_count
            num_features = self.representation.features_num
            self.all_phi_s = np.empty(
                (num_samples, num_features),
                dtype=self.representation.featureType())
            self.all_phi_ns = np.empty(
                (num_samples, num_features),
                dtype=self.representation.featureType())

            for idx in range(num_samples):
                state_features = self.representation.phi(self.data_s[idx])
                self.all_phi_s[idx, :] = state_features
                next_state_features = self.representation.phi(
                    self.data_ns[idx])
                self.all_phi_ns[idx, :] = next_state_features

            # State-action features for (s, a) and (ns, na).
            self.all_phi_s_a = self.representation.batchPhi_s_a(
                self.all_phi_s[:self.samples_count, :],
                self.data_a[:self.samples_count, :],
                use_sparse=self.use_sparse)
            self.all_phi_ns_na = self.representation.batchPhi_s_a(
                self.all_phi_ns[:self.samples_count, :],
                self.data_na[:self.samples_count, :],
                use_sparse=self.use_sparse)

            # Build the LSTD system: b = F1' R, A = F1' (F1 - gamma * F2).
            phi_sa = self.all_phi_s_a[:self.samples_count, :]
            phi_nsna = self.all_phi_ns_na[:self.samples_count, :]
            reward_col = self.data_r[:self.samples_count, :]
            gamma = self.discount_factor

            if self.use_sparse:
                b_term = phi_sa.T * reward_col
                a_term = phi_sa.T * (phi_sa - gamma * phi_nsna)
            else:
                b_term = np.dot(phi_sa.T, reward_col)
                a_term = np.dot(phi_sa.T, phi_sa - gamma * phi_nsna)
            self.b = b_term.reshape(-1, 1)
            self.A = a_term

        system_matrix = Tools.regularize(self.A)

        # Solve for weight_vec.
        new_weights, solve_time = Tools.solveLinear(system_matrix, self.b)
        self.representation.weight_vec = new_weights

        # Include the solve time in the log line only above one second.
        if solve_time > 1:
            self.logger.info(
                'Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)'
                % (Tools.deltaT(started_at), solve_time))
        else:
            self.logger.info(
                'Total LSTD Time = %0.0f(s)' % Tools.deltaT(started_at))
Example #3
0
    def policyIteration(self):
        """Update the policy by recalculating A based on new next actions.

        Each iteration asks ``self.representation.batchBestAction`` for the
        best next action of every sample, rebuilds ``A`` from the resulting
        next-state-action features (``self.b`` is reused unchanged) and
        solves for a new weight vector. The new weights are stored in
        ``self.representation.weight_vec`` only when they differ from the
        current ones by more than ``self.tol_epsilon`` (L2 norm). The loop
        ends after ``self.lspi_iterations`` iterations or as soon as the
        weight change is no longer above the tolerance.

        Side effects: resets ``self.best_performance`` to ``-inf`` and
        overwrites ``self.all_phi_ns_new_na`` on every iteration.

        Returns:
            The result of ``self.calculateTDErrors()`` from the last
            iteration that ran. It is evaluated before that iteration's new
            weights are stored. ``None`` is returned when no iteration ran
            at all (e.g. ``self.lspi_iterations <= 0``).
        """
        start_time = Tools.clock()
        weight_diff = self.tol_epsilon + 1  # So that the loop starts
        lspi_iteration = 0
        # Defined up front so the final ``return`` cannot raise
        # UnboundLocalError when the loop body never executes.
        td_errors = None
        self.best_performance = -np.inf
        self.logger.info('Running Policy Iteration:')

        # We save action_mask on the first iteration (used for
        # batchBestAction) to reuse it and boost the speed.
        # action_mask is a matrix that shows which actions are available for
        # each state
        action_mask = None
        discount_factor = self.discount_factor
        F1 = self.all_phi_s_a[:self.samples_count, :]
        if self.use_sparse:
            F1 = sp.csr_matrix(F1)

        while (lspi_iteration < self.lspi_iterations
               and weight_diff > self.tol_epsilon):

            # Find the best action for each state given the current value
            # function. Notice if actions have the same value the first
            # action is selected in the batch mode.
            iteration_start_time = Tools.clock()
            _, self.all_phi_ns_new_na, action_mask = (
                self.representation.batchBestAction(
                    self.data_ns[:self.samples_count, :],
                    self.all_phi_ns,
                    action_mask,
                    self.use_sparse))

            # Recalculate A matrix (b remains the same)
            # Solve for the new weight_vec
            if self.use_sparse:
                F2 = sp.csr_matrix(
                    self.all_phi_ns_new_na[:self.samples_count, :])
                A = F1.T * (F1 - discount_factor * F2)
            else:
                F2 = self.all_phi_ns_new_na[:self.samples_count, :]
                A = np.dot(F1.T, F1 - discount_factor * F2)

            A = Tools.regularize(A)
            new_weight_vec, _ = Tools.solveLinear(A, self.b)

            # Calculate TD errors. This happens before the new weights are
            # (possibly) stored below.
            td_errors = self.calculateTDErrors()

            # Calculate the weight difference. If it is big enough update the
            # weight_vec
            weight_diff = np.linalg.norm(
                self.representation.weight_vec - new_weight_vec)
            if weight_diff > self.tol_epsilon:
                self.representation.weight_vec = new_weight_vec

            self.logger.info(
                "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features"
                % (lspi_iteration + 1,
                   Tools.deltaT(iteration_start_time),
                   weight_diff,
                   Tools.sparsity(A),
                   self.representation.features_num))
            lspi_iteration += 1

        self.logger.info(
            'Total Policy Iteration Time = %0.0f(s)' %
            Tools.deltaT(start_time))
        return td_errors
Example #4
0
    def policyIteration(self):
        """Update the policy by recalculating A based on new next actions.

        On every pass the best next action for each sample is obtained from
        ``self.representation.batchBestAction``, ``A`` is rebuilt from the
        new next-state-action features (``self.b`` is left as is) and the
        linear system is solved again. The solved weights replace
        ``self.representation.weight_vec`` only if the L2 norm of the change
        exceeds ``self.tol_epsilon``. Iteration stops once that change is
        not above the tolerance or after ``self.lspi_iterations`` passes.

        Side effects: ``self.best_performance`` is reset to ``-inf`` and
        ``self.all_phi_ns_new_na`` is overwritten on each pass.

        Returns:
            Whatever ``self.calculateTDErrors()`` returned in the last pass
            that executed; it is called before that pass stores its new
            weights. ``None`` if the loop body never executed (e.g.
            ``self.lspi_iterations <= 0``).
        """
        start_time = Tools.clock()
        weight_diff = self.tol_epsilon + 1  # So that the loop starts
        lspi_iteration = 0
        # Bound before the loop: without this, returning after zero passes
        # would raise UnboundLocalError.
        td_errors = None
        self.best_performance = -np.inf
        self.logger.info('Running Policy Iteration:')

        # We save action_mask on the first iteration (used for
        # batchBestAction) to reuse it and boost the speed.
        # action_mask is a matrix that shows which actions are available for
        # each state
        action_mask = None
        discount_factor = self.discount_factor
        F1 = sp.csr_matrix(
            self.all_phi_s_a[:self.samples_count, :]
        ) if self.use_sparse else self.all_phi_s_a[:self.samples_count, :]
        while (lspi_iteration < self.lspi_iterations
               and weight_diff > self.tol_epsilon):

            # Find the best action for each state given the current value
            # function. Notice if actions have the same value the first
            # action is selected in the batch mode.
            iteration_start_time = Tools.clock()
            best_action_result = self.representation.batchBestAction(
                self.data_ns[:self.samples_count, :], self.all_phi_ns,
                action_mask, self.use_sparse)
            # The chosen actions themselves are not needed here; only the
            # features and the reusable mask are kept.
            _, self.all_phi_ns_new_na, action_mask = best_action_result

            # Recalculate A matrix (b remains the same)
            # Solve for the new weight_vec
            if self.use_sparse:
                F2 = sp.csr_matrix(
                    self.all_phi_ns_new_na[:self.samples_count, :])
                A = F1.T * (F1 - discount_factor * F2)
            else:
                F2 = self.all_phi_ns_new_na[:self.samples_count, :]
                A = np.dot(F1.T, F1 - discount_factor * F2)

            A = Tools.regularize(A)
            new_weight_vec, _ = Tools.solveLinear(A, self.b)

            # Calculate TD errors before the weights are (possibly) replaced
            # below.
            td_errors = self.calculateTDErrors()

            # Calculate the weight difference. If it is big enough update the
            # weight_vec
            weight_diff = np.linalg.norm(self.representation.weight_vec -
                                         new_weight_vec)
            if weight_diff > self.tol_epsilon:
                self.representation.weight_vec = new_weight_vec

            self.logger.info(
                "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features"
                % (lspi_iteration + 1,
                   Tools.deltaT(iteration_start_time), weight_diff,
                   Tools.sparsity(A), self.representation.features_num))
            lspi_iteration += 1

        self.logger.info('Total Policy Iteration Time = %0.0f(s)' %
                         Tools.deltaT(start_time))
        return td_errors