Code Example #1
File: LSPI.py Project: smcgregor/rlpy
    def representationExpansionLSPI(self):
        re_iteration = 0
        added_feature = True

        if self.representation.features_num == 0:
            print("No features, hence no LSPI is necessary!")
            return

        self.logger.info(
            "============================\nRunning LSPI with %d Samples\n============================" %
            self.samples_count)
        while added_feature and re_iteration <= self.re_iterations:
            re_iteration += 1
            # Some Prints
            if Tools.hasFunction(self.representation, 'batchDiscover'):
                self.logger.info(
                    '-----------------\nRepresentation Expansion iteration #%d\n-----------------' %
                    re_iteration)
            # Run LSTD for first solution
            self.LSTD()
            # Run Policy Iteration to change a_prime and recalculate weight_vec in a
            # loop
            td_errors = self.policyIteration()
            # Add new Features
            if Tools.hasFunction(self.representation, 'batchDiscover'):
                added_feature = self.representation.batchDiscover(
                    td_errors, self.all_phi_s[:self.samples_count, :],
                    self.data_s[:self.samples_count, :])
            else:
                added_feature = False
            # print 'L_inf distance to V*= ',
        if added_feature:
            # Run LSPI one last time with the new features
            self.LSTD()
            self.policyIteration()
Code Example #2
File: LSPI.py Project: zhuzhenping/rlpy
    def representationExpansionLSPI(self):
        re_iteration = 0
        added_feature = True

        if self.representation.features_num == 0:
            print("No features, hence no LSPI is necessary!")
            return

        self.logger.info(
            "============================\nRunning LSPI with %d Samples\n============================"
            % self.samples_count)
        while added_feature and re_iteration <= self.re_iterations:
            re_iteration += 1
            # Some Prints
            if Tools.hasFunction(self.representation, 'batchDiscover'):
                self.logger.info(
                    '-----------------\nRepresentation Expansion iteration #%d\n-----------------'
                    % re_iteration)
            # Run LSTD for first solution
            self.LSTD()
            # Run Policy Iteration to change a_prime and recalculate weight_vec in a
            # loop
            td_errors = self.policyIteration()
            # Add new Features
            if Tools.hasFunction(self.representation, 'batchDiscover'):
                added_feature = self.representation.batchDiscover(
                    td_errors, self.all_phi_s[:self.samples_count, :],
                    self.data_s[:self.samples_count, :])
            else:
                added_feature = False
            # print 'L_inf distance to V*= ',
        if added_feature:
            # Run LSPI one last time with the new features
            self.LSTD()
            self.policyIteration()
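
Both forks above implement the same expansion loop: solve with LSTD, refine the weights through policy iteration, then let the representation propose new features from the resulting TD errors. The loop stops when batchDiscover adds nothing or the re_iterations cap is hit, with one final solve if the last pass still added features. A minimal control-flow sketch of that loop, using hypothetical solve/discover callables rather than the rlpy API:

    def expand_until_stable(solve, discover, max_iters):
        # solve() runs LSTD + policy iteration and returns per-sample TD
        # errors; discover(td_errors) asks the representation for new
        # features and returns True if any were added (hypothetical stubs).
        added, iteration = True, 0
        while added and iteration <= max_iters:
            iteration += 1
            td_errors = solve()
            added = discover(td_errors)
        if added:
            # The cap was hit right after adding features; solve once more
            # so the weights are consistent with the expanded representation.
            solve()
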
Code Example #3
File: rmax_mcts.py Project: shengtu/rlpy
    def SolveWeight(self):
        # Sample N = 10 * features_num (state_id, action_id) pairs
        N = 10 * self.representation.features_num

        Q_vec = np.zeros((N, 1))
        aid_vec = np.random.choice(np.arange(0, self.ActionNum), replace=True, size=(1, N))
        sid_vec = np.random.choice(np.arange(0, self.StateNum), replace=True, size=(1, N))
        
        data_s = np.zeros((N, self.representation.state_space_dims))
        data_a = np.zeros((N, 1), dtype=np.uint32)
        f_size = self.representation.features_num * self.representation.actions_num
        all_phi_s = np.zeros((N, self.representation.features_num))
        for i in range(N):
            data_s[i, :] = self.representation.stateID2state(sid_vec[0][i])
            data_a[i] = aid_vec[0][i]
            all_phi_s[i, :] = self.representation.phi(data_s[i], False)
            Q_vec[i][0] = self.Q[sid_vec[0][i]][aid_vec[0][i]]

        all_phi_s_a = self.representation.batchPhi_s_a(all_phi_s[:N, :], data_a[:N, :], False)
        A = np.dot(all_phi_s_a.T, all_phi_s_a)
        b = np.dot(all_phi_s_a.T, Q_vec)
        A = Tools.regularize(A)

        new_weight_vec, solve_time = Tools.solveLinear(A, b)

        weight_diff = np.linalg.norm(self.representation.weight_vec - new_weight_vec)
        if weight_diff > self.tol_epsilon:
            self.representation.weight_vec = new_weight_vec.copy()
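
SolveWeight projects the sampled tabular Q-values onto the linear feature space by solving the regularized normal equations A w = b, where A = Phi^T Phi and b = Phi^T q over the stacked phi(s, a) rows. A self-contained toy version of that projection in plain NumPy (the ridge term stands in for Tools.regularize; all shapes are made up):

    import numpy as np

    rng = np.random.default_rng(0)
    phi = rng.random((1000, 8))           # N samples x f_size feature matrix
    q = rng.random((1000, 1))             # sampled Q(s, a) targets
    A = phi.T @ phi + 1e-6 * np.eye(8)    # regularized normal-equation matrix
    b = phi.T @ q
    w = np.linalg.solve(A, b)             # least-squares weight vector
    print(np.linalg.norm(phi @ w - q))    # residual of the projection
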
Code Example #4
File: LSPI.py Project: zhuzhenping/rlpy
    def LSTD(self):
        """Run the LSTD algorithm on the collected data, and update the
        policy parameters.
        """
        start_time = Tools.clock()

        if not self.fixedRep:
            # build phi_s and phi_ns for all samples
            p = self.samples_count
            n = self.representation.features_num
            self.all_phi_s = np.empty((p, n),
                                      dtype=self.representation.featureType())
            self.all_phi_ns = np.empty((p, n),
                                       dtype=self.representation.featureType())

            for i in np.arange(self.samples_count):
                self.all_phi_s[i, :] = self.representation.phi(self.data_s[i])
                self.all_phi_ns[i, :] = self.representation.phi(
                    self.data_ns[i])

            # build phi_s_a and phi_ns_na for all samples given phi_s and
            # phi_ns
            self.all_phi_s_a = self.representation.batchPhi_s_a(
                self.all_phi_s[:self.samples_count, :],
                self.data_a[:self.samples_count, :],
                use_sparse=self.use_sparse)
            self.all_phi_ns_na = self.representation.batchPhi_s_a(
                self.all_phi_ns[:self.samples_count, :],
                self.data_na[:self.samples_count, :],
                use_sparse=self.use_sparse)

            # calculate A and b for LSTD
            F1 = self.all_phi_s_a[:self.samples_count, :]
            F2 = self.all_phi_ns_na[:self.samples_count, :]
            R = self.data_r[:self.samples_count, :]
            discount_factor = self.discount_factor

            if self.use_sparse:
                self.b = (F1.T * R).reshape(-1, 1)
                self.A = F1.T * (F1 - discount_factor * F2)
            else:
                self.b = np.dot(F1.T, R).reshape(-1, 1)
                self.A = np.dot(F1.T, F1 - discount_factor * F2)

        A = Tools.regularize(self.A)

        # Calculate weight_vec
        self.representation.weight_vec, solve_time = Tools.solveLinear(
            A, self.b)

        # log solve time only if takes more than 1 second
        if solve_time > 1:
            self.logger.info(
                'Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)' %
                (Tools.deltaT(start_time), solve_time))
        else:
            self.logger.info('Total LSTD Time = %0.0f(s)' %
                             (Tools.deltaT(start_time)))
Code Example #5
File: LSPI.py Project: smcgregor/rlpy
    def LSTD(self):
        """Run the LSTD algorithm on the collected data, and update the
        policy parameters.
        """
        start_time = Tools.clock()

        if not self.fixedRep:
            # build phi_s and phi_ns for all samples
            p = self.samples_count
            n = self.representation.features_num
            self.all_phi_s = np.empty(
                (p, n), dtype=self.representation.featureType())
            self.all_phi_ns = np.empty(
                (p, n), dtype=self.representation.featureType())

            for i in np.arange(self.samples_count):
                self.all_phi_s[i, :]  = self.representation.phi(self.data_s[i])
                self.all_phi_ns[i, :] = self.representation.phi(self.data_ns[i])

            # build phi_s_a and phi_ns_na for all samples given phi_s and
            # phi_ns
            self.all_phi_s_a = self.representation.batchPhi_s_a(
                self.all_phi_s[:self.samples_count, :],
                self.data_a[:self.samples_count, :],
                use_sparse=self.use_sparse)
            self.all_phi_ns_na = self.representation.batchPhi_s_a(
                self.all_phi_ns[:self.samples_count, :],
                self.data_na[:self.samples_count, :],
                use_sparse=self.use_sparse)

            # calculate A and b for LSTD
            F1              = self.all_phi_s_a[:self.samples_count, :]
            F2              = self.all_phi_ns_na[:self.samples_count, :]
            R               = self.data_r[:self.samples_count, :]
            discount_factor = self.discount_factor

            if self.use_sparse:
                self.b = (F1.T * R).reshape(-1, 1)
                self.A = F1.T * (F1 - discount_factor * F2)
            else:
                self.b = np.dot(F1.T, R).reshape(-1, 1)
                self.A = np.dot(F1.T, F1 - discount_factor * F2)

        A = Tools.regularize(self.A)

        # Calculate weight_vec
        self.representation.weight_vec, solve_time = Tools.solveLinear(A, self.b)

        # log solve time only if takes more than 1 second
        if solve_time > 1:
            self.logger.info(
                'Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)' %
                (Tools.deltaT(start_time), solve_time))
        else:
            self.logger.info(
                'Total LSTD Time = %0.0f(s)' %
                (Tools.deltaT(start_time)))
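
Apart from whitespace, the two LSTD forks build the same linear system: with F1 the stacked phi(s, a) rows, F2 the stacked phi(s', a') rows, and R the rewards, they solve A w = b where A = F1^T (F1 - gamma F2) and b = F1^T R. A dense toy construction of that system (plain NumPy, made-up shapes; the ridge term stands in for Tools.regularize):

    import numpy as np

    rng = np.random.default_rng(0)
    n_samples, n_feats, gamma = 500, 16, 0.9
    F1 = rng.random((n_samples, n_feats))   # phi(s, a) per sample
    F2 = rng.random((n_samples, n_feats))   # phi(s', a') per sample
    R = rng.random((n_samples, 1))          # rewards

    A = F1.T @ (F1 - gamma * F2)            # LSTD A matrix
    b = (F1.T @ R).reshape(-1, 1)           # LSTD b vector
    A += 1e-6 * np.eye(n_feats)             # regularization
    w = np.linalg.solve(A, b)               # weight_vec for the current policy
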
Code Example #6
File: rmax_mcts.py Project: shengtu/rlpy
    def learn(self, s, p_actions, a, r, ns, np_actions, na, terminal):

        # The previous state could never be terminal
        # (otherwise the episode would have already terminated)
        prevStateTerminal = False

        # MUST call this at start of learn()
        self.representation.pre_discover(s, prevStateTerminal, a, ns, terminal)

        
        # Compute feature function values and the next action to be taken
        discount_factor  = self.discount_factor  # 'gamma' in the literature
        feat_weights     = self.representation.weight_vec  # value function, expressed as feature weights
        features_s       = self.representation.phi(s, prevStateTerminal)  # active features in state s
        features         = self.representation.phi_sa(s, prevStateTerminal, a, features_s)  # active features of the (s, a) pair
        features_prime_s = self.representation.phi(ns, terminal)
        features_prime   = self.representation.phi_sa(ns, terminal, na, features_prime_s)
        nnz              = Tools.count_nonzero(features_s)  # number of non-zero feature elements
        # Compute td-error
        td_error        = r + np.dot(discount_factor * features_prime - features, feat_weights)
        
        # Learn a model and plan on the current learned model:
        # policy iteration (LSPI) / RMax
        self.ModelBasedLearn(s, a, ns, r)

        ###############################
        # MUST call this at end of learn() - add new features to representation as required.
        expanded = self.representation.post_discover(s, False, a, td_error, features_s)

        # MUST call this at end of learn() - handle episode termination cleanup as required.
        if terminal:
            self.episodeTerminated()
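
The TD error in learn() is a direct dot product: td_error = r + (gamma * phi(s', a') - phi(s, a)) . w. A short numeric check of that expression with made-up values:

    import numpy as np

    w = np.array([0.5, -0.2, 1.0])          # current feature weights
    phi_sa = np.array([1.0, 0.0, 1.0])      # phi(s, a)
    phi_nsna = np.array([0.0, 1.0, 1.0])    # phi(s', a')
    r, gamma = 1.0, 0.9
    td_error = r + np.dot(gamma * phi_nsna - phi_sa, w)  # same form as learn()
    print(td_error)                         # prints ~0.22
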
Code Example #7
File: LSPI.py Project: smcgregor/rlpy
    def policyIteration(self):
        """Update the policy by recalculating A based on new na.

        Returns the TD error for each sample based on the latest weights and next actions.
        """
        start_time = Tools.clock()
        weight_diff = self.tol_epsilon + 1  # So that the loop starts
        lspi_iteration = 0
        self.best_performance = -np.inf
        self.logger.info('Running Policy Iteration:')

        # We save action_mask on the first iteration (used for batchBestAction) to reuse it and boost the speed
        # action_mask is a matrix that shows which actions are available for
        # each state
        action_mask = None
        discount_factor = self.discount_factor
        F1 = sp.csr_matrix(
            self.all_phi_s_a[:self.samples_count, :]
        ) if self.use_sparse else self.all_phi_s_a[:self.samples_count, :]
        while lspi_iteration < self.lspi_iterations and weight_diff > self.tol_epsilon:

            # Find the best action for each state given the current value function
            # Notice if actions have the same value the first action is
            # selected in the batch mode
            iteration_start_time = Tools.clock()
            bestAction, self.all_phi_ns_new_na, action_mask = self.representation.batchBestAction(
                self.data_ns[:self.samples_count, :], self.all_phi_ns,
                action_mask, self.use_sparse)

            # Recalculate A matrix (b remains the same)
            # Solve for the new weight_vec
            if self.use_sparse:
                F2 = sp.csr_matrix(self.all_phi_ns_new_na[:self.samples_count, :])
                A = F1.T * (F1 - discount_factor * F2)
            else:
                F2 = self.all_phi_ns_new_na[:self.samples_count, :]
                A = np.dot(F1.T, F1 - discount_factor * F2)

            A = Tools.regularize(A)
            new_weight_vec, solve_time = Tools.solveLinear(A, self.b)

            # Calculate TD_Errors
            ####################
            td_errors = self.calculateTDErrors()

            # Calculate the weight difference. If it is big enough update the
            # weight_vec
            weight_diff = np.linalg.norm(self.representation.weight_vec - new_weight_vec)
            if weight_diff > self.tol_epsilon:
                self.representation.weight_vec = new_weight_vec

            self.logger.info(
                "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features"
                % (lspi_iteration + 1, Tools.deltaT(iteration_start_time),
                   weight_diff, Tools.sparsity(A),
                   self.representation.features_num))
            lspi_iteration += 1

        self.logger.info(
            'Total Policy Iteration Time = %0.0f(s)' %
            Tools.deltaT(start_time))
        return td_errors
Code Example #8
File: LSPI.py Project: zhuzhenping/rlpy
    def policyIteration(self):
        """Update the policy by recalculating A based on new na.

        Returns the TD error for each sample based on the latest weights and next actions.
        """
        start_time = Tools.clock()
        weight_diff = self.tol_epsilon + 1  # So that the loop starts
        lspi_iteration = 0
        self.best_performance = -np.inf
        self.logger.info('Running Policy Iteration:')

        # We save action_mask on the first iteration (used for batchBestAction) to reuse it and boost the speed
        # action_mask is a matrix that shows which actions are available for
        # each state
        action_mask = None
        discount_factor = self.discount_factor
        F1 = sp.csr_matrix(
            self.all_phi_s_a[:self.samples_count, :]
        ) if self.use_sparse else self.all_phi_s_a[:self.samples_count, :]
        while lspi_iteration < self.lspi_iterations and weight_diff > self.tol_epsilon:

            # Find the best action for each state given the current value function
            # Notice if actions have the same value the first action is
            # selected in the batch mode
            iteration_start_time = Tools.clock()
            bestAction, self.all_phi_ns_new_na, action_mask = self.representation.batchBestAction(
                self.data_ns[:self.samples_count, :], self.all_phi_ns,
                action_mask, self.use_sparse)

            # Recalculate A matrix (b remains the same)
            # Solve for the new weight_vec
            if self.use_sparse:
                F2 = sp.csr_matrix(
                    self.all_phi_ns_new_na[:self.samples_count, :])
                A = F1.T * (F1 - discount_factor * F2)
            else:
                F2 = self.all_phi_ns_new_na[:self.samples_count, :]
                A = np.dot(F1.T, F1 - discount_factor * F2)

            A = Tools.regularize(A)
            new_weight_vec, solve_time = Tools.solveLinear(A, self.b)

            # Calculate TD_Errors
            ####################
            td_errors = self.calculateTDErrors()

            # Calculate the weight difference. If it is big enough update the
            # weight_vec
            weight_diff = np.linalg.norm(self.representation.weight_vec -
                                         new_weight_vec)
            if weight_diff > self.tol_epsilon:
                self.representation.weight_vec = new_weight_vec

            self.logger.info(
                "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features"
                % (lspi_iteration + 1,
                   Tools.deltaT(iteration_start_time), weight_diff,
                   Tools.sparsity(A), self.representation.features_num))
            lspi_iteration += 1

        self.logger.info('Total Policy Iteration Time = %0.0f(s)' %
                         Tools.deltaT(start_time))
        return td_errors
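
Both policyIteration variants run the same fixed-point loop: pick the greedy next action under the current weights, rebuild A from the resulting phi(s', a*) rows while b stays fixed, solve for new weights, and stop once the weight change drops to tol_epsilon or the iteration cap is reached. A condensed sketch of that loop, with a hypothetical best_next_phi helper in place of batchBestAction and a ridge term in place of Tools.regularize:

    import numpy as np

    def lspi_policy_iteration(F1, b, best_next_phi, gamma, tol, max_iters, w):
        # best_next_phi(w) returns the greedy phi(s', a*) matrix under
        # weights w (hypothetical stand-in for batchBestAction).
        for _ in range(max_iters):
            F2 = best_next_phi(w)                 # greedy action relabeling
            A = F1.T @ (F1 - gamma * F2)          # b is unchanged across iterations
            A += 1e-6 * np.eye(A.shape[0])        # regularization
            w_new = np.linalg.solve(A, b)
            if np.linalg.norm(w - w_new) <= tol:  # converged
                return w_new
            w = w_new
        return w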