Python Tools.solveLinearの例、rlpy.Tools.solveLinear Pythonの例

コード例 #1

0

ファイルを表示

ファイル: rmax_mcts.py プロジェクト: shengtu/rlpy

    def SolveWeight(self):
        """Refit the representation's weights to the tabular ``self.Q`` values.

        Draws ``N = 10 * features_num`` random (state id, action id) pairs
        with replacement, builds the state-action feature matrix ``Phi`` for
        them and solves ``(Phi^T Phi) w = Phi^T Q`` after passing the left-hand
        matrix through ``Tools.regularize``.

        ``self.representation.weight_vec`` is replaced (with a copy of the
        solution) only when the Euclidean distance between the current and
        the new weights exceeds ``self.tol_epsilon``.
        """
        rep = self.representation
        N = 10 * rep.features_num

        # Keep the draw order (actions first, then states) so that runs with a
        # fixed NumPy seed sample exactly the same pairs as before.
        aid_vec = np.random.choice(
            np.arange(0, self.ActionNum), replace=True, size=(1, N))[0]
        sid_vec = np.random.choice(
            np.arange(0, self.StateNum), replace=True, size=(1, N))[0]

        Q_vec = np.zeros((N, 1))
        data_s = np.zeros((N, rep.state_space_dims))
        data_a = np.zeros((N, 1), dtype=np.uint32)
        all_phi_s = np.zeros((N, rep.features_num))

        # enumerate/zip works on both Python 2 and 3 (the original used xrange).
        for i, (sid, aid) in enumerate(zip(sid_vec, aid_vec)):
            data_s[i, :] = rep.stateID2state(sid)
            data_a[i] = aid
            all_phi_s[i, :] = rep.phi(data_s[i], False)
            Q_vec[i][0] = self.Q[sid][aid]

        # The previous pre-allocated sparse placeholder for this matrix was
        # overwritten right away, so it is built directly here.
        all_phi_s_a = rep.batchPhi_s_a(all_phi_s, data_a, False)
        A = np.dot(all_phi_s_a.T, all_phi_s_a)
        b = np.dot(all_phi_s_a.T, Q_vec)
        A = Tools.regularize(A)

        new_weight_vec, _solve_time = Tools.solveLinear(A, b)

        weight_diff = np.linalg.norm(rep.weight_vec - new_weight_vec)
        if weight_diff > self.tol_epsilon:
            rep.weight_vec = new_weight_vec.copy()

コード例 #2

0

ファイルを表示

ファイル: LSPI.py プロジェクト: zhuzhenping/rlpy

    def LSTD(self):
        """Solve the LSTD system for the stored samples and install the weights.

        Unless ``self.fixedRep`` is set, the per-sample feature matrices and
        the ``self.A`` / ``self.b`` terms are rebuilt from the first
        ``self.samples_count`` samples.  ``self.A`` is then passed through
        ``Tools.regularize`` and the solution of the linear system is stored
        in ``self.representation.weight_vec``.  Timing is written to
        ``self.logger``.
        """
        t0 = Tools.clock()

        if not self.fixedRep:
            rep = self.representation
            count = self.samples_count
            shape = (count, rep.features_num)

            # State features for the current and the next state of each sample.
            self.all_phi_s = np.empty(shape, dtype=rep.featureType())
            self.all_phi_ns = np.empty(shape, dtype=rep.featureType())
            for row in range(count):
                self.all_phi_s[row, :] = rep.phi(self.data_s[row])
                self.all_phi_ns[row, :] = rep.phi(self.data_ns[row])

            # State-action features derived from the state features above.
            self.all_phi_s_a = rep.batchPhi_s_a(
                self.all_phi_s[:count, :],
                self.data_a[:count, :],
                use_sparse=self.use_sparse)
            self.all_phi_ns_na = rep.batchPhi_s_a(
                self.all_phi_ns[:count, :],
                self.data_na[:count, :],
                use_sparse=self.use_sparse)

            # Assemble b = F1^T R and A = F1^T (F1 - gamma * F2).
            F1 = self.all_phi_s_a[:count, :]
            F2 = self.all_phi_ns_na[:count, :]
            R = self.data_r[:count, :]
            gamma = self.discount_factor

            if self.use_sparse:
                # For sparse matrices `*` is the matrix product.
                self.b = (F1.T * R).reshape(-1, 1)
                self.A = F1.T * (F1 - gamma * F2)
            else:
                self.b = np.dot(F1.T, R).reshape(-1, 1)
                self.A = np.dot(F1.T, F1 - gamma * F2)

        A = Tools.regularize(self.A)

        # Solve for the new weight vector.
        solution, solve_time = Tools.solveLinear(A, self.b)
        self.representation.weight_vec = solution

        # Mention the solve time only when it exceeded one second.
        if solve_time > 1:
            message = 'Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)' % (
                Tools.deltaT(t0), solve_time)
        else:
            message = 'Total LSTD Time = %0.0f(s)' % (Tools.deltaT(t0))
        self.logger.info(message)

コード例 #3

0

ファイルを表示

ファイル: LSPI.py プロジェクト: smcgregor/rlpy

    def LSTD(self):
        """Run LSTD on the collected samples and update the policy weights.

        When ``self.fixedRep`` is false, features for every stored sample are
        recomputed and ``self.A`` / ``self.b`` are rebuilt from them;
        otherwise the existing ``self.A`` and ``self.b`` are reused.  The
        result of ``Tools.solveLinear`` on the regularized ``A`` becomes
        ``self.representation.weight_vec``, and the elapsed time is logged.
        """
        started = Tools.clock()

        if not self.fixedRep:
            num_samples = self.samples_count
            num_features = self.representation.features_num

            # Per-sample features of the state and of the next state.
            self.all_phi_s = np.empty(
                (num_samples, num_features),
                dtype=self.representation.featureType())
            self.all_phi_ns = np.empty(
                (num_samples, num_features),
                dtype=self.representation.featureType())
            for k in range(num_samples):
                self.all_phi_s[k, :] = self.representation.phi(self.data_s[k])
                self.all_phi_ns[k, :] = self.representation.phi(self.data_ns[k])

            # Expand the state features into state-action features.
            phi_s_rows = self.all_phi_s[:num_samples, :]
            actions = self.data_a[:num_samples, :]
            self.all_phi_s_a = self.representation.batchPhi_s_a(
                phi_s_rows, actions, use_sparse=self.use_sparse)

            phi_ns_rows = self.all_phi_ns[:num_samples, :]
            next_actions = self.data_na[:num_samples, :]
            self.all_phi_ns_na = self.representation.batchPhi_s_a(
                phi_ns_rows, next_actions, use_sparse=self.use_sparse)

            # LSTD terms: b = F1^T R, A = F1^T (F1 - gamma * F2).
            F1 = self.all_phi_s_a[:num_samples, :]
            F2 = self.all_phi_ns_na[:num_samples, :]
            R = self.data_r[:num_samples, :]
            gamma = self.discount_factor

            if self.use_sparse:
                # `*` on sparse matrices is matrix multiplication.
                self.b = (F1.T * R).reshape(-1, 1)
                self.A = F1.T * (F1 - gamma * F2)
            else:
                self.b = np.dot(F1.T, R).reshape(-1, 1)
                self.A = np.dot(F1.T, F1 - gamma * F2)

        regularized_A = Tools.regularize(self.A)

        # Compute and install the new weight vector.
        weights, solve_time = Tools.solveLinear(regularized_A, self.b)
        self.representation.weight_vec = weights

        # The solve time is reported only if it took more than one second.
        if solve_time > 1:
            text = 'Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)' % (
                Tools.deltaT(started), solve_time)
        else:
            text = 'Total LSTD Time = %0.0f(s)' % (Tools.deltaT(started))
        self.logger.info(text)

コード例 #4

0

ファイルを表示

ファイル: LSPI.py プロジェクト: smcgregor/rlpy

    def policyIteration(self):
        """Update the policy by recalculating A based on new next actions.

        Each iteration picks the best next action for every stored sample via
        ``self.representation.batchBestAction``, rebuilds
        ``A = F1^T (F1 - discount_factor * F2)`` (``self.b`` is reused),
        solves for a new weight vector and installs it when it differs from
        the current one by more than ``self.tol_epsilon`` (Euclidean norm).
        The loop stops after ``self.lspi_iterations`` iterations or as soon
        as the weight change is no longer above ``self.tol_epsilon``.

        Returns:
            The value of ``self.calculateTDErrors()`` from the last iteration,
            or ``None`` when no iteration ran (e.g. ``self.lspi_iterations``
            is not positive).  Previously that case raised
            ``UnboundLocalError``.
        """
        start_time = Tools.clock()
        weight_diff = self.tol_epsilon + 1  # So that the loop starts
        lspi_iteration = 0
        td_errors = None  # stays None if the loop body never executes
        self.best_performance = -np.inf
        self.logger.info('Running Policy Iteration:')

        # action_mask is produced by the first batchBestAction call and fed
        # back into later calls so it does not have to be recomputed.
        action_mask = None
        discount_factor = self.discount_factor
        if self.use_sparse:
            F1 = sp.csr_matrix(self.all_phi_s_a[:self.samples_count, :])
        else:
            F1 = self.all_phi_s_a[:self.samples_count, :]

        while lspi_iteration < self.lspi_iterations and weight_diff > self.tol_epsilon:
            # Find the best action for each next state given the current
            # value function.
            iteration_start_time = Tools.clock()
            _best_action, self.all_phi_ns_new_na, action_mask = \
                self.representation.batchBestAction(
                    self.data_ns[:self.samples_count, :],
                    self.all_phi_ns,
                    action_mask,
                    self.use_sparse)

            # Recalculate the A matrix (b remains the same).
            if self.use_sparse:
                F2 = sp.csr_matrix(self.all_phi_ns_new_na[:self.samples_count, :])
                # `*` on sparse matrices is matrix multiplication.
                A = F1.T * (F1 - discount_factor * F2)
            else:
                F2 = self.all_phi_ns_new_na[:self.samples_count, :]
                A = np.dot(F1.T, F1 - discount_factor * F2)

            A = Tools.regularize(A)
            new_weight_vec, _solve_time = Tools.solveLinear(A, self.b)

            # NOTE(review): TD errors are computed before weight_vec is
            # replaced below; calculateTDErrors() is not visible here, so
            # confirm this ordering gives the weights the caller expects.
            td_errors = self.calculateTDErrors()

            # Install the new weights only if they moved far enough.
            weight_diff = np.linalg.norm(
                self.representation.weight_vec - new_weight_vec)
            if weight_diff > self.tol_epsilon:
                self.representation.weight_vec = new_weight_vec

            self.logger.info(
                "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features" % (
                    lspi_iteration + 1,
                    Tools.deltaT(iteration_start_time),
                    weight_diff,
                    Tools.sparsity(A),
                    self.representation.features_num))
            lspi_iteration += 1

        self.logger.info(
            'Total Policy Iteration Time = %0.0f(s)' %
            Tools.deltaT(start_time))
        return td_errors

コード例 #5

0

ファイルを表示

ファイル: LSPI.py プロジェクト: zhuzhenping/rlpy

    def policyIteration(self):
        """Update the policy by recalculating A based on new next actions.

        Runs at most ``self.lspi_iterations`` iterations, stopping early once
        the weight change is no longer above ``self.tol_epsilon``.  Every
        iteration asks ``self.representation.batchBestAction`` for the
        features of the best next actions, rebuilds
        ``A = F1^T (F1 - discount_factor * F2)`` while reusing ``self.b``,
        solves the regularized system and, if the new weights differ from the
        current ones by more than ``self.tol_epsilon``, installs them in
        ``self.representation.weight_vec``.

        Returns:
            The result of the last ``self.calculateTDErrors()`` call, or
            ``None`` if the loop body never executed (for example when
            ``self.lspi_iterations`` is zero).  That case used to fail with
            ``UnboundLocalError`` at the ``return`` statement.
        """
        start_time = Tools.clock()
        weight_diff = self.tol_epsilon + 1  # So that the loop starts
        lspi_iteration = 0
        # Bound up front so the final `return` is valid even with zero
        # iterations.
        td_errors = None
        self.best_performance = -np.inf
        self.logger.info('Running Policy Iteration:')

        # The mask returned by the first batchBestAction call is passed back
        # in on subsequent calls instead of being rebuilt.
        action_mask = None
        discount_factor = self.discount_factor
        n_samples_slice = slice(None, self.samples_count)
        F1 = self.all_phi_s_a[n_samples_slice, :]
        if self.use_sparse:
            F1 = sp.csr_matrix(F1)

        while (lspi_iteration < self.lspi_iterations
               and weight_diff > self.tol_epsilon):
            iteration_start_time = Tools.clock()

            # Best next action per sample under the current value function.
            _best_action, self.all_phi_ns_new_na, action_mask = \
                self.representation.batchBestAction(
                    self.data_ns[:self.samples_count, :], self.all_phi_ns,
                    action_mask, self.use_sparse)

            # Recalculate A (b remains the same) and solve for new weights.
            F2 = self.all_phi_ns_new_na[:self.samples_count, :]
            if self.use_sparse:
                F2 = sp.csr_matrix(F2)
                # `*` on sparse matrices is matrix multiplication.
                A = F1.T * (F1 - discount_factor * F2)
            else:
                A = np.dot(F1.T, F1 - discount_factor * F2)

            A = Tools.regularize(A)
            new_weight_vec, _solve_time = Tools.solveLinear(A, self.b)

            # NOTE(review): this runs before weight_vec is updated below;
            # calculateTDErrors() is defined elsewhere, so verify that the
            # returned errors correspond to the intended weights.
            td_errors = self.calculateTDErrors()

            # Only adopt the new weights when the change is large enough.
            weight_diff = np.linalg.norm(self.representation.weight_vec -
                                         new_weight_vec)
            if weight_diff > self.tol_epsilon:
                self.representation.weight_vec = new_weight_vec

            self.logger.info(
                "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features"
                % (lspi_iteration + 1,
                   Tools.deltaT(iteration_start_time), weight_diff,
                   Tools.sparsity(A), self.representation.features_num))
            lspi_iteration += 1

        self.logger.info('Total Policy Iteration Time = %0.0f(s)' %
                         Tools.deltaT(start_time))
        return td_errors