def LSTD(self):
    """Run the LSTD algorithm on the collected data, and update the
    policy parameters.
    """
    start_time = tools.clock()

    if not self.fixed_rep:
        # build phi_s and phi_ns for all samples
        p = self.samples_count
        n = self.representation.features_num
        self.all_phi_s = np.empty(
            (p, n), dtype=self.representation.feature_type())
        self.all_phi_ns = np.empty(
            (p, n), dtype=self.representation.feature_type())
        for i in np.arange(self.samples_count):
            self.all_phi_s[i, :] = self.representation.phi(self.data_s[i])
            self.all_phi_ns[i, :] = self.representation.phi(self.data_ns[i])

        # build phi_s_a and phi_ns_na for all samples given phi_s and phi_ns
        self.all_phi_s_a = self.representation.batch_phi_sa(
            self.all_phi_s[:self.samples_count, :],
            self.data_a[:self.samples_count, :],
            use_sparse=self.use_sparse,
        )
        self.all_phi_ns_na = self.representation.batch_phi_sa(
            self.all_phi_ns[:self.samples_count, :],
            self.data_na[:self.samples_count, :],
            use_sparse=self.use_sparse,
        )

    # calculate A and b for LSTD
    F1 = self.all_phi_s_a[:self.samples_count, :]
    F2 = self.all_phi_ns_na[:self.samples_count, :]
    R = self.data_r[:self.samples_count, :]
    discount_factor = self.discount_factor

    if self.use_sparse:
        self.b = (F1.T * R).reshape(-1, 1)
        self.A = F1.T * (F1 - discount_factor * F2)
    else:
        self.b = np.dot(F1.T, R).reshape(-1, 1)
        self.A = np.dot(F1.T, F1 - discount_factor * F2)

    A = tools.regularize(self.A)

    # Calculate weight_vec
    self.representation.weight_vec, solve_time = tools.solveLinear(A, self.b)

    # log the solve time only if it takes more than 1 second
    if solve_time > 1:
        self.logger.info(
            "Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)"
            % (tools.deltaT(start_time), solve_time))
    else:
        self.logger.info(
            "Total LSTD Time = %0.0f(s)" % (tools.deltaT(start_time)))
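# --- Illustrative sketch (not part of the solver) ---------------------------
# The LSTD step above solves the linear system A w = b with
#   A = Phi_sa^T (Phi_sa - gamma * Phi_ns_na)   and   b = Phi_sa^T R.
# The standalone snippet below rebuilds that system on random synthetic
# feature matrices; the names, shapes, and the small ridge term (standing in
# for tools.regularize) are assumptions chosen for the example.
def _lstd_sketch():
    import numpy as np

    rng = np.random.default_rng(0)
    p, k = 200, 10                       # samples, state-action features
    gamma = 0.9
    phi_sa = rng.normal(size=(p, k))     # features of (s, a)
    phi_ns_na = rng.normal(size=(p, k))  # features of (s', a')
    rewards = rng.normal(size=(p, 1))

    A = phi_sa.T @ (phi_sa - gamma * phi_ns_na)
    b = phi_sa.T @ rewards
    # Small ridge term keeps the system solvable when A is ill-conditioned.
    w = np.linalg.solve(A + 1e-6 * np.eye(k), b)
    return w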
def policy_improvement(self, policy):
    """
    Given a policy, improve it by taking the greedy action in each state
    based on the value function.
    Returns the new policy.
    """
    policyChanges = 0
    i = 0
    while i < self.representation.num_states_total and self.has_time():
        s = self.representation.stateID2state(i)
        p_actions = self.domain.possible_actions(s)
        if not self.domain.is_terminal(s) and len(p_actions):
            for a in p_actions:
                self.bellman_backup(s, a, self.ns_samples, policy)
            p_actions = self.domain.possible_actions(s=s)
            best_action = self.representation.best_action(s, False, p_actions)
            if policy.pi(s, False, p_actions) != best_action:
                policyChanges += 1
        i += 1

    # This will cause the policy to be copied over
    policy.representation.weight = self.representation.weight.copy()
    perf_return, perf_steps, perf_term, perf_disc_return = self.performance_run()
    self.logger.info(
        "PI #%d [%s]: BellmanUpdates=%d, Policy Change=%d, Return=%0.4f, Steps=%d"
        % (
            self.policy_improvement_iteration,
            hhmmss(deltaT(self.start_time)),
            self.bellman_updates,
            policyChanges,
            perf_return,
            perf_steps,
        ))

    # store stats
    self.result["bellman_updates"].append(self.bellman_updates)
    self.result["return"].append(perf_return)
    self.result["planning_time"].append(deltaT(self.start_time))
    self.result["num_features"].append(self.representation.features_num)
    self.result["steps"].append(perf_steps)
    self.result["terminated"].append(perf_term)
    self.result["discounted_return"].append(perf_disc_return)
    self.result["policy_improvement_iteration"].append(
        self.policy_improvement_iteration)
    return policy, policyChanges
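# --- Illustrative sketch (not part of the solver) ---------------------------
# Policy improvement reduces to replacing the action in every state with the
# greedy one, argmax_a Q(s, a), and counting how many states changed. The toy
# version below does exactly that on a random tabular Q; all names are
# hypothetical and chosen for the example.
def _greedy_improvement_sketch():
    import numpy as np

    rng = np.random.default_rng(6)
    n_states, n_actions = 8, 4
    Q = rng.normal(size=(n_states, n_actions))
    old_policy = rng.integers(n_actions, size=n_states)

    new_policy = Q.argmax(axis=1)                  # greedy action per state
    policy_changes = int(np.sum(new_policy != old_policy))
    return new_policy, policy_changes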
def evaluate(self, total_steps, episode_number, visualize=0):
    """
    Evaluate the current agent within an experiment

    :param total_steps: (int) number of steps used in learning so far
    :param episode_number: (int) number of episodes used in learning so far
    """
    random_state = np.random.get_state()
    # random_state_domain = copy(self.domain.random_state)
    elapsedTime = deltaT(self.start_time)
    performance_return = 0.0
    performance_steps = 0.0
    performance_term = 0.0
    performance_discounted_return = 0.0
    for j in range(self.checks_per_policy):
        p_ret, p_step, p_term, p_dret = self.performance_run(
            total_steps, visualize=visualize > j)
        performance_return += p_ret
        performance_steps += p_step
        performance_term += p_term
        performance_discounted_return += p_dret
    performance_return /= self.checks_per_policy
    performance_steps /= self.checks_per_policy
    performance_term /= self.checks_per_policy
    performance_discounted_return /= self.checks_per_policy

    self.result["learning_steps"].append(total_steps)
    self.result["return"].append(performance_return)
    self.result["learning_time"].append(self.elapsed_time)
    self.result["num_features"].append(self.agent.representation.features_num)
    self.result["steps"].append(performance_steps)
    self.result["terminated"].append(performance_term)
    self.result["learning_episode"].append(episode_number)
    self.result["discounted_return"].append(performance_discounted_return)

    # reset start time such that performance runs don't count
    self.start_time = clock() - elapsedTime
    if total_steps > 0:
        remaining = hhmmss(
            elapsedTime * (self.max_steps - total_steps) / total_steps)
    else:
        remaining = "?"
    self.logger.info(
        self.performance_log_template.format(
            total_steps=total_steps,
            elapsed=hhmmss(elapsedTime),
            remaining=remaining,
            totreturn=performance_return,
            steps=performance_steps,
            num_feat=self.agent.representation.features_num,
        ))
    np.random.set_state(random_state)
def traj_based_policy_evaluation(self, policy):
    """
    Evaluate the current policy by simulating trajectories and update the
    value function along the visited states.
    """
    PE_iteration = 0
    evaluation_is_accurate = False
    converged_trajectories = 0
    while (not evaluation_is_accurate and self.has_time()
           and PE_iteration < self.max_pe_iterations):
        # Generate a new episode e-greedy with the current values
        max_bellman_error = 0
        step = 0
        s, a, terminal = self.sample_ns_na(policy, start_trajectory=True)
        while not terminal and step < self.domain.episode_cap and self.has_time():
            bellman_error, phi_s, phi_s_a = self._bellman_error(s, a, terminal)

            # Update the value function using an approximate Bellman backup
            self.representation.weight_vec += self.alpha * bellman_error * phi_s_a
            self.bellman_updates += 1
            step += 1
            max_bellman_error = max(max_bellman_error, abs(bellman_error))

            # Discover features if the representation supports it
            if hasattr(self.representation, "discover"):
                self.representation.post_discover(phi_s, bellman_error)
            s, a, terminal = self.sample_ns_na(policy, a)

        # check for convergence of policy evaluation
        PE_iteration += 1
        if max_bellman_error < self.convergence_threshold:
            converged_trajectories += 1
        else:
            converged_trajectories = 0
        evaluation_is_accurate = (
            converged_trajectories >= self.MIN_CONVERGED_TRAJECTORIES)
        self.logger.info(
            "PE #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Features=%d"
            % (
                PE_iteration,
                hhmmss(deltaT(self.start_time)),
                self.bellman_updates,
                max_bellman_error,
                self.representation.features_num,
            ))
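# --- Illustrative sketch (not part of the solver) ---------------------------
# Each inner step above performs an approximate Bellman backup along the
# sampled trajectory: w <- w + alpha * delta * phi(s, a), where
# delta = r + gamma * w^T phi(s', a') - w^T phi(s, a).
# The snippet below shows the same update on synthetic features; the names
# (alpha, gamma, phi_sa, phi_ns_na) are assumptions for the example only.
def _td_update_sketch():
    import numpy as np

    rng = np.random.default_rng(1)
    k, alpha, gamma = 8, 0.1, 0.95
    w = np.zeros(k)
    phi_sa = rng.normal(size=k)     # features of the visited (s, a)
    phi_ns_na = rng.normal(size=k)  # features of the next (s', a')
    r = 1.0

    delta = r + gamma * w @ phi_ns_na - w @ phi_sa  # Bellman (TD) error
    w = w + alpha * delta * phi_sa                  # gradient-style correction
    return w, delta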
def policyIteration(self):
    """Update the policy by recalculating A based on the new next actions (na).

    Returns the TD error for each sample based on the latest weights and
    next actions.
    """
    start_time = tools.clock()
    weight_diff = self.tol_epsilon + 1  # So that the loop starts
    lspi_iteration = 0
    self.best_performance = -np.inf
    self.logger.info("Running Policy Iteration:")

    # We save action_mask on the first iteration (used for batch_best_action)
    # so it can be reused to boost the speed.
    # action_mask is a matrix that shows which actions are available for each
    # state.
    action_mask = None
    discount_factor = self.discount_factor
    F1 = (sp.csr_matrix(self.all_phi_s_a[:self.samples_count, :])
          if self.use_sparse else self.all_phi_s_a[:self.samples_count, :])
    while lspi_iteration < self.lspi_iterations and weight_diff > self.tol_epsilon:
        # Find the best action for each state given the current value function.
        # Note that if several actions have the same value, the first one is
        # selected in batch mode.
        iteration_start_time = tools.clock()
        (
            best_action,
            self.all_phi_ns_new_na,
            action_mask,
        ) = self.representation.batch_best_action(
            self.data_ns[:self.samples_count, :],
            self.all_phi_ns,
            action_mask,
            self.use_sparse,
        )

        # Recalculate the A matrix (b remains the same) and solve for the new
        # weight_vec
        if self.use_sparse:
            F2 = sp.csr_matrix(self.all_phi_ns_new_na[:self.samples_count, :])
            A = F1.T * (F1 - discount_factor * F2)
        else:
            F2 = self.all_phi_ns_new_na[:self.samples_count, :]
            A = np.dot(F1.T, F1 - discount_factor * F2)

        A = tools.regularize(A)
        new_weight_vec, solve_time = tools.solveLinear(A, self.b)

        # Calculate TD errors
        td_errors = self.calculateTDErrors()

        # Calculate the weight difference. If it is big enough, update
        # weight_vec.
        weight_diff = np.linalg.norm(
            self.representation.weight_vec - new_weight_vec)
        if weight_diff > self.tol_epsilon:
            self.representation.weight_vec = new_weight_vec

        self.logger.info(
            "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features"
            % (
                lspi_iteration + 1,
                tools.deltaT(iteration_start_time),
                weight_diff,
                tools.sparsity(A),
                self.representation.features_num,
            ))
        lspi_iteration += 1

    self.logger.info(
        "Total Policy Iteration Time = %0.0f(s)" % tools.deltaT(start_time))
    return td_errors
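# --- Illustrative sketch (not part of the solver) ---------------------------
# policyIteration alternates two steps until the weights stop moving:
#   (1) pick the greedy next action under the current weights, which changes
#       the next-feature matrix F2, and (2) re-solve A w = b with the new F2.
# The toy loop below mimics that structure with random per-action feature
# matrices; every name and shape in it is hypothetical, chosen for the example.
def _lspi_loop_sketch(max_iterations=20, tol=1e-4):
    import numpy as np

    rng = np.random.default_rng(2)
    p, k, n_actions, gamma = 300, 6, 3, 0.9
    F1 = rng.normal(size=(p, k))                    # phi(s, a) for the data
    R = rng.normal(size=(p, 1))
    # One candidate next-feature matrix per action: phi(s', a') for each a'.
    F2_per_action = rng.normal(size=(n_actions, p, k))
    b = F1.T @ R
    w = np.zeros((k, 1))

    for _ in range(max_iterations):
        # Greedy next action per sample under the current weights.
        q_next = np.stack([F2_per_action[a] @ w for a in range(n_actions)])
        best = q_next[:, :, 0].argmax(axis=0)
        F2 = F2_per_action[best, np.arange(p), :]
        A = F1.T @ (F1 - gamma * F2)
        w_new = np.linalg.solve(A + 1e-6 * np.eye(k), b)
        if np.linalg.norm(w_new - w) < tol:
            return w_new
        w = w_new
    return w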
def run(self, visualize_performance=0, visualize_learning=False,
        visualize_steps=False):
    """
    Run the experiment and collect statistics / generate the results

    :param visualize_performance: (int) determines whether a visualization
        of the steps taken in performance runs is shown. 0 means no
        visualization is shown. A value n > 0 means that only the first n
        performance runs for a specific policy are shown (i.e., for
        n < checks_per_policy, not all performance runs are shown)
    :param visualize_learning: (boolean) show some visualization of the
        learning status before each performance evaluation (e.g. value
        function)
    :param visualize_steps: (boolean) visualize all steps taken during
        learning
    """
    self.performance_domain = deepcopy(self.domain)
    self.performance_domain.performance = True
    self.seed_components()

    self.result = defaultdict(list)
    self.result["seed"] = self.exp_id
    total_steps = 0
    eps_steps = 0
    eps_return = 0
    episode_number = 0

    # show policy or value function of initial policy
    if visualize_learning:
        self.domain.show_learning(self.agent.representation)

    # Used to bound the number of logs in the file
    start_log_time = clock()
    # Used to track the total time taken by the experiment
    self.start_time = clock()
    self.elapsed_time = 0
    # do a first evaluation to get the quality of the initial policy
    self.evaluate(total_steps, episode_number, visualize_performance)
    self.total_eval_time = 0.0
    terminal = True
    while total_steps < self.max_steps:
        if terminal or eps_steps >= self.domain.episode_cap:
            s, terminal, p_actions = self.domain.s0()
            a = self.agent.policy.pi(s, terminal, p_actions)
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)
            # Reset episode statistics
            eps_return = 0
            eps_steps = 0
            episode_number += 1

        # Act, Step
        r, ns, terminal, np_actions = self.domain.step(a)
        self._gather_transition_statistics(s, a, ns, r, learning=True)
        na = self.agent.policy.pi(ns, terminal, np_actions)

        total_steps += 1
        eps_steps += 1
        eps_return += r

        # Print the current performance if enough time has passed since the
        # last log
        if ((terminal or eps_steps == self.domain.episode_cap)
                and deltaT(start_log_time) > self.log_interval):
            start_log_time = clock()
            elapsedTime = deltaT(self.start_time)
            self.logger.info(
                self.log_template.format(
                    total_steps=total_steps,
                    elapsed=hhmmss(elapsedTime),
                    remaining=hhmmss(
                        elapsedTime * (self.max_steps - total_steps) / total_steps),
                    totreturn=eps_return,
                    steps=eps_steps,
                    num_feat=self.agent.representation.features_num,
                ))

        # learning
        self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
        s, a, p_actions = ns, na, np_actions

        # Visual
        if visualize_steps:
            self.domain.show(a, self.agent.representation)

        # Check Performance
        if total_steps % (self.max_steps // self.num_policy_checks) == 0:
            self.elapsed_time = deltaT(self.start_time) - self.total_eval_time

            # show policy or value function
            if visualize_learning:
                self.domain.show_learning(self.agent.representation)

            self.evaluate(total_steps, episode_number, visualize_performance)
            self.total_eval_time += (deltaT(self.start_time)
                                     - self.elapsed_time
                                     - self.total_eval_time)
            start_log_time = clock()

    # Visual
    if visualize_steps:
        self.domain.show(a, self.agent.representation)
    self.logger.info(
        "Total Experiment Duration %s" % (hhmmss(deltaT(self.start_time))))
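# --- Illustrative sketch (not part of the experiment class) -----------------
# The learning loop above follows a SARSA-style handoff: act with (s, a),
# observe (r, ns), pick na for the next state, let the agent learn from the
# full transition, then shift (s, a) <- (ns, na). The minimal loop below shows
# that control flow only; `domain` and `agent` are hypothetical objects that
# expose the same s0/step/pi/learn interface assumed here, and episode caps,
# logging, and evaluation are intentionally left out.
def _interaction_loop_sketch(domain, agent, max_steps):
    total_steps = 0
    terminal = True
    while total_steps < max_steps:
        if terminal:
            s, terminal, p_actions = domain.s0()
            a = agent.policy.pi(s, terminal, p_actions)
        r, ns, terminal, np_actions = domain.step(a)
        na = agent.policy.pi(ns, terminal, np_actions)
        agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
        s, a, p_actions = ns, na, np_actions
        total_steps += 1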
def has_time(self):
    """Return a boolean stating if there is time left for planning."""
    return deltaT(self.start_time) < self.planning_time
def solve_in_matrix_format(self):
    # while delta_weight_vec > threshold
    #   1. Gather data following an e-greedy policy
    #   2. Calculate A and b estimates
    #   3. Calculate new_weight_vec and delta_weight_vec
    # return policy greedy w.r.t. last weight_vec
    self.policy = eGreedy(self.representation, epsilon=self.epsilon)

    # Number of samples to be used for each policy evaluation phase. L1 in
    # the Geramifard et al. FTML 2012 paper
    self.samples_num = 1000

    self.start_time = clock()  # Used to track the total time for solving
    samples = 0
    converged = False
    iteration = 0
    while self.has_time() and not converged:
        # 1. Gather samples following an e-greedy policy
        S, Actions, NS, R, T = self.collect_samples(self.samples_num)
        samples += self.samples_num

        # 2. Calculate A and b estimates
        a_num = self.domain.num_actions
        n = self.representation.features_num
        discount_factor = self.domain.discount_factor

        self.A = np.zeros((n * a_num, n * a_num))
        self.b = np.zeros((n * a_num, 1))
        for i in range(self.samples_num):
            phi_s_a = self.representation.phi_sa(
                S[i], T[i], Actions[i, 0]).reshape((-1, 1))
            E_phi_ns_na = self.calculate_expected_phi_ns_na(
                S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
            d = phi_s_a - discount_factor * E_phi_ns_na
            self.A += np.outer(phi_s_a, d.T)
            self.b += phi_s_a * R[i, 0]

        # 3. Calculate new_weight_vec and delta_weight_vec
        new_weight_vec, solve_time = solveLinear(regularize(self.A), self.b)
        iteration += 1
        if solve_time > 1:
            self.logger.info(
                "#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)"
                % (iteration, solve_time))
        weight_diff = l_norm(new_weight_vec - self.representation.weight_vec)
        converged = weight_diff < self.convergence_threshold
        self.representation.weight_vec = new_weight_vec
        (
            perf_return,
            perf_steps,
            perf_term,
            perf_disc_return,
        ) = self.performance_run()
        self.logger.info(
            "#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f"
            % (
                iteration,
                hhmmss(deltaT(self.start_time)),
                samples,
                weight_diff,
                perf_return,
            ))
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["samples"].append(samples)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
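# --- Illustrative sketch (not part of the solver) ---------------------------
# Step 2 above accumulates the matrix-format estimates one sample at a time:
#   A += phi(s, a) (phi(s, a) - gamma * E[phi(s', a')])^T,   b += phi(s, a) r.
# The snippet below repeats that accumulation with random feature vectors; the
# shapes and the expectation stand-in are assumptions made for the example.
def _matrix_estimate_sketch():
    import numpy as np

    rng = np.random.default_rng(3)
    n_samples, k, gamma = 500, 12, 0.9
    A = np.zeros((k, k))
    b = np.zeros((k, 1))
    for _ in range(n_samples):
        phi_s_a = rng.normal(size=(k, 1))
        # Stand-in for the expected next feature vector E[phi(s', a')].
        e_phi_ns_na = rng.normal(size=(k, 1))
        r = rng.normal()
        d = phi_s_a - gamma * e_phi_ns_na
        A += phi_s_a @ d.T
        b += phi_s_a * r
    # Ridge term plays the role of the regularize() call above.
    w = np.linalg.solve(A + 1e-6 * np.eye(k), b)
    return w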
def _solve_impl(self):
    """Solve the domain MDP."""
    self.start_time = clock()  # Used to track the total time for solving
    self.bellman_updates = 0
    converged = False
    PI_iteration = 0

    # The policy is maintained as a separate copy of the representation.
    # This way, as the representation is updated, the policy remains intact.
    policy = eGreedy(deepcopy(self.representation), epsilon=0, deterministic=True)
    a_num = self.domain.num_actions

    while self.has_time() and not converged:
        # Policy evaluation: update the value-function representation for the
        # current policy
        self.traj_based_policy_evaluation(policy)
        PI_iteration += 1

        # The weight vector can grow if the representation is expanded, hence
        # pad the policy's weights with zeros before comparing
        additional_dim = (self.representation.features_num
                          - policy.representation.features_num)
        padded_theta = np.hstack(
            (policy.representation.weight, np.zeros((a_num, additional_dim))))

        # Calculate the change in the weight_vec as an L2-norm
        weight_diff = np.linalg.norm(padded_theta - self.representation.weight)
        converged = weight_diff < self.convergence_threshold

        # Update the underlying value function of the policy
        policy.representation = deepcopy(self.representation)

        (
            perf_return,
            perf_steps,
            perf_term,
            perf_disc_return,
        ) = self.performance_run()
        self.logger.info(
            "PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, "
            "Return=%0.3f, steps=%d, features=%d"
            % (
                PI_iteration,
                hhmmss(deltaT(self.start_time)),
                self.bellman_updates,
                weight_diff,
                perf_return,
                perf_steps,
                self.representation.features_num,
            ))
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["bellman_updates"].append(self.bellman_updates)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["policy_improvement_iteration"].append(PI_iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
def policy_evaluation(self, policy):
    """
    Evaluate a given policy: this is done by applying the Bellman backup over
    all states until the change is less than a given threshold.

    Returns: convergence status as a boolean
    """
    converged = False
    policy_evaluation_iteration = 0
    while (not converged and self.has_time()
           and policy_evaluation_iteration < self.max_pe_iterations):
        policy_evaluation_iteration += 1

        # Sweep The State Space
        for i in range(0, self.representation.num_states_total):
            # Check for solver time
            if not self.has_time():
                break

            # Map a state ID to a state
            s = self.representation.stateID2state(i)

            # Skip terminal states and states with no possible action
            possible_actions = self.domain.possible_actions(s=s)
            if self.domain.is_terminal(s) or len(possible_actions) == 0:
                continue

            # Apply Bellman Backup
            self.bellman_backup(
                s, policy.pi(s, False, possible_actions), self.ns_samples, policy)

            # Update number of backups
            self.bellman_updates += 1

            # Check for the performance
            if self.bellman_updates % self.log_interval == 0:
                performance_return = self.performance_run()[0]
                self.logger.info(
                    "[%s]: BellmanUpdates=%d, Return=%0.4f"
                    % (
                        hhmmss(deltaT(self.start_time)),
                        self.bellman_updates,
                        performance_return,
                    ))

        # check for convergence: norm of the difference between the policy's
        # weight vector and the representation's weight vector
        weight_diff = l_norm(
            policy.representation.weight - self.representation.weight)
        converged = weight_diff < self.convergence_threshold

        # Log Status
        self.logger.info(
            "PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f"
            % (
                policy_evaluation_iteration,
                hhmmss(deltaT(self.start_time)),
                self.bellman_updates,
                weight_diff,
            ))

        # Show Plots
        if self._visualize_mode:
            self.domain.show_learning(self.representation)
    return converged
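# --- Illustrative sketch (not part of the solver) ---------------------------
# The sweep above repeatedly applies the policy-evaluation Bellman backup
#   V(s) <- sum_{s'} P(s' | s, pi(s)) * (R(s, pi(s)) + gamma * V(s'))
# to every state. The tabular analogue below makes that backup explicit on a
# tiny synthetic MDP; P, R, and the sizes are made-up inputs for the example.
def _tabular_policy_evaluation_sketch(threshold=1e-6):
    import numpy as np

    rng = np.random.default_rng(4)
    n_states, gamma = 5, 0.9
    P = rng.random((n_states, n_states))   # P[s, s'] under the fixed policy
    P /= P.sum(axis=1, keepdims=True)
    R = rng.normal(size=n_states)           # expected reward in each state
    V = np.zeros(n_states)

    while True:
        V_new = R + gamma * P @ V            # one Bellman backup per state
        if np.max(np.abs(V_new - V)) < threshold:
            return V_new
        V = V_new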
def _solve_impl(self):
    """Solve the domain MDP."""
    self.start_time = clock()  # Used to track the total time for solving
    bellman_updates = 0  # number of Bellman backups performed so far
    converged = False
    iteration = 0
    num_states = self.representation.num_states_total

    while self.has_time() and not converged:
        iteration += 1

        # Store the weight vector for comparison
        prev_weight = self.representation.weight.copy()

        # Sweep The State Space
        for i in range(num_states):
            s = self.representation.stateID2state(i)
            # Sweep through possible actions
            if self.domain.is_terminal(s):
                continue
            for a in self.domain.possible_actions(s):
                self.bellman_backup(s, a, ns_samples=self.ns_samples)
                bellman_updates += 1

                # Create Log
                if bellman_updates % self.log_interval == 0:
                    performance_return, _, _, _ = self.performance_run()
                    self._log_updates(performance_return, bellman_updates)

        # check for convergence
        weight_diff = l_norm(prev_weight - self.representation.weight)
        converged = weight_diff < self.convergence_threshold

        # log the stats
        (
            perf_return,
            perf_steps,
            perf_term,
            perf_disc_return,
        ) = self.performance_run()
        self.logger.info(
            "PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, "
            "Return=%0.4f, Steps=%d"
            % (
                iteration,
                hhmmss(deltaT(self.start_time)),
                bellman_updates,
                weight_diff,
                perf_return,
                perf_steps,
            ))

        # Show the domain and value function
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["bellman_updates"].append(bellman_updates)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
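# --- Illustrative sketch (not part of the solver) ---------------------------
# Each sweep above backs up every state-action pair and stops when the weight
# vector stops changing. The tabular analogue is the value-iteration update
#   V(s) <- max_a sum_{s'} P(s' | s, a) * (R(s, a) + gamma * V(s')),
# shown below on a random synthetic MDP; P, R, and the sizes are assumptions
# chosen for the example.
def _tabular_value_iteration_sketch(threshold=1e-6):
    import numpy as np

    rng = np.random.default_rng(5)
    n_states, n_actions, gamma = 5, 3, 0.9
    P = rng.random((n_actions, n_states, n_states))   # P[a, s, s']
    P /= P.sum(axis=2, keepdims=True)
    R = rng.normal(size=(n_actions, n_states))         # R[a, s]
    V = np.zeros(n_states)

    while True:
        Q = R + gamma * np.einsum("ast,t->as", P, V)    # Q[a, s]
        V_new = Q.max(axis=0)                           # greedy backup
        if np.max(np.abs(V_new - V)) < threshold:
            return V_new
        V = V_new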
def _log_updates(self, perf_return, bellman_updates):
    dt = hhmmss(deltaT(self.start_time))
    self.logger.info(
        "[%s]: BellmanUpdates=%d, Return=%0.4f"
        % (dt, bellman_updates, perf_return))
def _solve_impl(self):
    """Solve the domain MDP."""
    # Used to track the total time taken by the solver
    self.start_time = clock()
    bellman_updates = 0
    converged = False
    iteration = 0
    # Track the number of consecutive trajectories with a very small observed
    # Bellman error
    converged_trajectories = 0
    while self.has_time() and not converged:
        max_bellman_error = 0
        step = 0
        s, terminal, p_actions = self.domain.s0()

        # Generate a new episode e-greedy with the current values
        while not terminal and step < self.domain.episode_cap and self.has_time():
            a = self.eps_greedy(s, terminal, p_actions)
            bellman_error, phi_s, phi_s_a = self._bellman_error(s, a, terminal)

            # Update Parameters
            self.representation.weight_vec += self.alpha * bellman_error * phi_s_a
            bellman_updates += 1
            step += 1

            # Discover features if the representation supports it
            if hasattr(self.representation, "discover"):
                self.representation.post_discover(phi_s, bellman_error)

            max_bellman_error = max(max_bellman_error, abs(bellman_error))
            # Simulate the next state and action on the trajectory
            _, s, terminal, p_actions = self.domain.step(a)

        # check for convergence
        iteration += 1
        if max_bellman_error < self.convergence_threshold:
            converged_trajectories += 1
        else:
            converged_trajectories = 0
        (
            perf_return,
            perf_steps,
            perf_term,
            perf_disc_return,
        ) = self.performance_run()
        converged = converged_trajectories >= self.MIN_CONVERGED_TRAJECTORIES
        self.logger.info(
            "PI #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Return=%0.4f, "
            "Steps=%d, Features=%d"
            % (
                iteration,
                hhmmss(deltaT(self.start_time)),
                bellman_updates,
                max_bellman_error,
                perf_return,
                perf_steps,
                self.representation.features_num,
            ))
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["bellman_updates"].append(bellman_updates)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()