def SolveWeight(self):
    """Fit the representation's weight vector to sampled entries of ``self.Q``.

    Draws ``N = 10 * features_num`` action ids and state ids uniformly at
    random with replacement, builds the state-action feature matrix ``Phi``
    for the sampled pairs through ``batchPhi_s_a`` and solves
    ``(Phi^T Phi) w = Phi^T q``, where ``q`` holds the matching ``self.Q``
    entries.  The left-hand matrix is passed through ``Tools.regularize``
    before ``Tools.solveLinear`` is called.

    ``self.representation.weight_vec`` is replaced by a copy of the solution
    only when the Euclidean distance between the two exceeds
    ``self.tol_epsilon``.  Nothing is returned.
    """
    representation = self.representation
    # The sample size scales with the number of features.
    N = 10 * representation.features_num

    # Action ids are drawn before state ids so the global random stream is
    # consumed in the same order as before.
    aid_vec = np.random.choice(
        np.arange(0, self.ActionNum), replace=True, size=N)
    sid_vec = np.random.choice(
        np.arange(0, self.StateNum), replace=True, size=N)

    data_s = np.zeros((N, representation.state_space_dims))
    # (N, 1) column of uint32 action ids, the layout handed to batchPhi_s_a.
    data_a = aid_vec.astype(np.uint32).reshape(N, 1)
    all_phi_s = np.zeros((N, representation.features_num))
    Q_vec = np.zeros((N, 1))

    for i, (sid, aid) in enumerate(zip(sid_vec, aid_vec)):
        data_s[i, :] = representation.stateID2state(sid)
        # phi is fed the row as stored in data_s (i.e. after the float cast).
        all_phi_s[i, :] = representation.phi(data_s[i], False)
        Q_vec[i, 0] = self.Q[sid][aid]

    # NOTE(review): the trailing False is presumably the use_sparse flag, so
    # a dense matrix comes back and np.dot applies -- confirm against
    # batchPhi_s_a.
    all_phi_s_a = representation.batchPhi_s_a(all_phi_s, data_a, False)

    A = np.dot(all_phi_s_a.T, all_phi_s_a)
    b = np.dot(all_phi_s_a.T, Q_vec)
    A = Tools.regularize(A)
    new_weight_vec, _ = Tools.solveLinear(A, b)

    weight_diff = np.linalg.norm(representation.weight_vec - new_weight_vec)
    if weight_diff > self.tol_epsilon:
        representation.weight_vec = new_weight_vec.copy()
def LSTD(self):
    """Solve the LSTD linear system and store the weights on the representation.

    When ``self.fixedRep`` is false, the per-sample features of the stored
    states and next states are recomputed with ``representation.phi``,
    expanded to state-action features with ``batchPhi_s_a``, and
    ``self.A = F1^T (F1 - gamma * F2)`` and ``self.b = F1^T R`` are rebuilt
    from them.  ``self.A`` is then passed through ``Tools.regularize`` and
    the system is solved with ``Tools.solveLinear``; the solution is written
    to ``self.representation.weight_vec``.  Timing is reported through
    ``self.logger``.
    """
    # NOTE(review): the source formatting was flattened; the nesting below
    # assumes A and b are only rebuilt here when the representation is not
    # fixed (the solve reads self.A / self.b either way) -- confirm.
    started = Tools.clock()
    rep = self.representation

    if not self.fixedRep:
        count = self.samples_count
        shape = (count, rep.features_num)

        # Per-sample features for states and next states.
        self.all_phi_s = np.empty(shape, dtype=rep.featureType())
        self.all_phi_ns = np.empty(shape, dtype=rep.featureType())
        for row in np.arange(count):
            self.all_phi_s[row, :] = rep.phi(self.data_s[row])
            self.all_phi_ns[row, :] = rep.phi(self.data_ns[row])

        # State-action features derived from the per-state features.
        self.all_phi_s_a = rep.batchPhi_s_a(
            self.all_phi_s[:count, :],
            self.data_a[:count, :],
            use_sparse=self.use_sparse)
        self.all_phi_ns_na = rep.batchPhi_s_a(
            self.all_phi_ns[:count, :],
            self.data_na[:count, :],
            use_sparse=self.use_sparse)

        # Assemble A and b.
        F1 = self.all_phi_s_a[:count, :]
        F2 = self.all_phi_ns_na[:count, :]
        R = self.data_r[:count, :]
        gamma = self.discount_factor

        if self.use_sparse:
            # On the sparse path `*` is used as the matrix product.
            self.b = (F1.T * R).reshape(-1, 1)
            self.A = F1.T * (F1 - gamma * F2)
        else:
            self.b = np.dot(F1.T, R).reshape(-1, 1)
            self.A = np.dot(F1.T, F1 - gamma * F2)

    regularized = Tools.regularize(self.A)

    # Solve for the weights and install them on the representation.
    rep.weight_vec, solve_time = Tools.solveLinear(regularized, self.b)

    # The solve time is only mentioned when it exceeds one second.
    if solve_time > 1:
        message = 'Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)' % (
            Tools.deltaT(started), solve_time)
    else:
        message = 'Total LSTD Time = %0.0f(s)' % (Tools.deltaT(started))
    self.logger.info(message)
def LSTD(self):
    """Run LSTD over the stored samples and update the representation weights.

    Unless ``self.fixedRep`` is set, the feature matrices for the stored
    states / next states (``all_phi_s``, ``all_phi_ns``) and their
    state-action expansions (``all_phi_s_a``, ``all_phi_ns_na``) are rebuilt,
    followed by ``self.b = F1^T R`` and ``self.A = F1^T (F1 - gamma * F2)``.
    The weights are then obtained from ``Tools.solveLinear`` applied to the
    output of ``Tools.regularize(self.A)`` and ``self.b``, and assigned to
    ``self.representation.weight_vec``.  Elapsed time is logged.
    """
    # NOTE(review): indentation was lost in the source; everything up to the
    # A/b assembly is assumed to sit under `if not self.fixedRep` -- confirm.
    t_begin = Tools.clock()
    representation = self.representation

    if not self.fixedRep:
        n_samples = self.samples_count
        n_features = representation.features_num

        # Allocate, then fill row by row (state first, next state second).
        self.all_phi_s = np.empty(
            (n_samples, n_features), dtype=representation.featureType())
        self.all_phi_ns = np.empty(
            (n_samples, n_features), dtype=representation.featureType())
        for k in np.arange(n_samples):
            self.all_phi_s[k, :] = representation.phi(self.data_s[k])
            self.all_phi_ns[k, :] = representation.phi(self.data_ns[k])

        # Expand per-state features to state-action features.
        phi_s_rows = self.all_phi_s[:n_samples, :]
        actions = self.data_a[:n_samples, :]
        self.all_phi_s_a = representation.batchPhi_s_a(
            phi_s_rows, actions, use_sparse=self.use_sparse)

        phi_ns_rows = self.all_phi_ns[:n_samples, :]
        next_actions = self.data_na[:n_samples, :]
        self.all_phi_ns_na = representation.batchPhi_s_a(
            phi_ns_rows, next_actions, use_sparse=self.use_sparse)

        # Build b and A from the first n_samples rows.
        F1 = self.all_phi_s_a[:n_samples, :]
        F2 = self.all_phi_ns_na[:n_samples, :]
        rewards = self.data_r[:n_samples, :]
        discount = self.discount_factor

        if not self.use_sparse:
            self.b = np.dot(F1.T, rewards).reshape(-1, 1)
            self.A = np.dot(F1.T, F1 - discount * F2)
        else:
            # On the sparse path `*` is used as the matrix product.
            self.b = (F1.T * rewards).reshape(-1, 1)
            self.A = F1.T * (F1 - discount * F2)

    A = Tools.regularize(self.A)

    # Solve and store the new weight vector.
    solution, solve_time = Tools.solveLinear(A, self.b)
    representation.weight_vec = solution

    # Report the solve time separately only when it is above one second.
    if solve_time > 1:
        self.logger.info(
            'Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)' %
            (Tools.deltaT(t_begin), solve_time))
        return
    self.logger.info(
        'Total LSTD Time = %0.0f(s)' % (Tools.deltaT(t_begin)))
def policyIteration(self):
    """Alternate best-next-action selection and weight solving until converged.

    Each pass asks ``representation.batchBestAction`` for the next-action
    features of the stored next states, rebuilds
    ``A = F1^T (F1 - gamma * F2)`` from them (``self.b`` is reused
    unchanged), passes ``A`` through ``Tools.regularize`` and solves with
    ``Tools.solveLinear``.  The representation's ``weight_vec`` is replaced
    only when it differs from the new solution by more than
    ``self.tol_epsilon`` (Euclidean norm).  The loop stops after
    ``self.lspi_iterations`` passes or once that difference is within
    tolerance.

    Side effects: resets ``self.best_performance`` to ``-inf``, overwrites
    ``self.all_phi_ns_new_na`` on every pass, and logs per-pass and total
    timing through ``self.logger``.

    Returns:
        The value of ``self.calculateTDErrors()`` obtained in the last pass
        (it is evaluated before that pass's weight update is applied), or
        ``None`` when no pass was executed, e.g. when
        ``self.lspi_iterations <= 0``.
    """
    start_time = Tools.clock()
    # Start above the tolerance so the convergence test passes on entry.
    weight_diff = self.tol_epsilon + 1
    lspi_iteration = 0
    # Bound up front: previously the final return raised UnboundLocalError
    # whenever the loop body never ran.
    td_errors = None
    self.best_performance = -np.inf
    self.logger.info('Running Policy Iteration:')

    # action_mask is produced by the first batchBestAction call and handed
    # back on later passes so it can be reused.
    action_mask = None
    discount_factor = self.discount_factor

    if self.use_sparse:
        F1 = sp.csr_matrix(self.all_phi_s_a[:self.samples_count, :])
    else:
        F1 = self.all_phi_s_a[:self.samples_count, :]

    while (lspi_iteration < self.lspi_iterations
           and weight_diff > self.tol_epsilon):
        iteration_start_time = Tools.clock()

        # The first returned value (the chosen actions) is not needed here.
        _, self.all_phi_ns_new_na, action_mask = \
            self.representation.batchBestAction(
                self.data_ns[:self.samples_count, :],
                self.all_phi_ns,
                action_mask,
                self.use_sparse)

        # Recalculate A (b stays the same) and solve for the new weights.
        if self.use_sparse:
            F2 = sp.csr_matrix(
                self.all_phi_ns_new_na[:self.samples_count, :])
            # `*` is the matrix product for scipy sparse matrices.
            A = F1.T * (F1 - discount_factor * F2)
        else:
            F2 = self.all_phi_ns_new_na[:self.samples_count, :]
            A = np.dot(F1.T, F1 - discount_factor * F2)
        A = Tools.regularize(A)
        new_weight_vec, solve_time = Tools.solveLinear(A, self.b)

        # TD errors are taken before the weight update below.
        td_errors = self.calculateTDErrors()

        # Only adopt the new weights when the change is above tolerance.
        weight_diff = np.linalg.norm(
            self.representation.weight_vec - new_weight_vec)
        if weight_diff > self.tol_epsilon:
            self.representation.weight_vec = new_weight_vec

        self.logger.info(
            "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features" %
            (lspi_iteration + 1,
             Tools.deltaT(iteration_start_time),
             weight_diff,
             Tools.sparsity(A),
             self.representation.features_num))
        lspi_iteration += 1

    self.logger.info(
        'Total Policy Iteration Time = %0.0f(s)' % Tools.deltaT(start_time))
    return td_errors
def policyIteration(self):
    """Update the policy by repeatedly recalculating A from new next actions.

    On every pass the representation's ``batchBestAction`` supplies the
    next-action feature matrix for the stored next states;
    ``A = F1^T (F1 - gamma * F2)`` is rebuilt from it while ``self.b`` is
    left untouched, ``A`` goes through ``Tools.regularize`` and the system is
    solved with ``Tools.solveLinear``.  The new weights replace
    ``self.representation.weight_vec`` only if the Euclidean distance between
    old and new exceeds ``self.tol_epsilon``.  Iteration ends after
    ``self.lspi_iterations`` passes or when that distance is within
    tolerance.

    Side effects: ``self.best_performance`` is reset to ``-inf``,
    ``self.all_phi_ns_new_na`` is overwritten each pass, and progress is
    logged through ``self.logger``.

    Returns:
        The result of ``self.calculateTDErrors()`` from the final pass
        (computed before that pass's weight update), or ``None`` if the loop
        body was never entered (for instance ``self.lspi_iterations <= 0``).
    """
    start_time = Tools.clock()
    weight_diff = self.tol_epsilon + 1  # so that the loop starts
    lspi_iteration = 0
    # Defined before the loop so that a zero-pass run returns None instead of
    # failing with UnboundLocalError at the return statement.
    td_errors = None
    self.best_performance = -np.inf
    self.logger.info('Running Policy Iteration:')

    # The mask returned by the first batchBestAction call is passed back in
    # on subsequent calls so it does not have to be rebuilt.
    action_mask = None
    discount_factor = self.discount_factor

    F1 = self.all_phi_s_a[:self.samples_count, :]
    if self.use_sparse:
        F1 = sp.csr_matrix(F1)

    while (lspi_iteration < self.lspi_iterations
           and weight_diff > self.tol_epsilon):
        iteration_start_time = Tools.clock()

        # Only the features and the mask are used; the selected actions
        # themselves are discarded.
        _, self.all_phi_ns_new_na, action_mask = \
            self.representation.batchBestAction(
                self.data_ns[:self.samples_count, :],
                self.all_phi_ns,
                action_mask,
                self.use_sparse)

        # Recalculate the A matrix (b remains the same).
        F2 = self.all_phi_ns_new_na[:self.samples_count, :]
        if self.use_sparse:
            F2 = sp.csr_matrix(F2)
            # `*` is the matrix product for scipy sparse matrices.
            A = F1.T * (F1 - discount_factor * F2)
        else:
            A = np.dot(F1.T, F1 - discount_factor * F2)
        A = Tools.regularize(A)

        # Solve for the candidate weight vector.
        new_weight_vec, solve_time = Tools.solveLinear(A, self.b)

        # TD errors are evaluated before the candidate weights are adopted.
        td_errors = self.calculateTDErrors()

        # Adopt the candidate only if it moved far enough.
        weight_diff = np.linalg.norm(
            self.representation.weight_vec - new_weight_vec)
        if weight_diff > self.tol_epsilon:
            self.representation.weight_vec = new_weight_vec

        self.logger.info(
            "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features" %
            (lspi_iteration + 1,
             Tools.deltaT(iteration_start_time),
             weight_diff,
             Tools.sparsity(A),
             self.representation.features_num))
        lspi_iteration += 1

    self.logger.info('Total Policy Iteration Time = %0.0f(s)' %
                     Tools.deltaT(start_time))
    return td_errors