def policyEvaluation(self, policy):
    '''
    Evaluate a given policy: this is done by applying the Bellman backup
    over all states until the change is less than a given threshold.

    Each sweep visits every aggregated state id, skips terminal states and
    states without possible actions, and backs up the action selected by
    ``policy``. Sweeping stops when the L-infinity distance between the
    policy's weight vector and the solver's weight vector drops below
    ``self.convergence_threshold``, when ``self.hasTime()`` turns false, or
    after ``self.max_PE_iterations`` sweeps.

    :param policy: the policy to evaluate; ``policy.pi`` picks the action
        that is backed up and ``policy.representation.weight_vec`` is the
        reference for the convergence check.

    Returns: convergence status as a boolean
    '''
    converged = False
    policy_evaluation_iteration = 0
    # Last state visited by a sweep. Stays None when nothing was swept
    # (no states, or time ran out before the first one), so the plotting
    # code below never touches an unbound name.
    s = None
    while (not converged and self.hasTime()
           and policy_evaluation_iteration < self.max_PE_iterations):
        policy_evaluation_iteration += 1

        # Sweep The State Space
        for i in range(self.representation.agg_states_num):
            # Check for solver time
            if not self.hasTime():
                break

            # Map an state ID to state
            s = self.representation.stateID2state(i)

            # Skip terminal states and states with no possible action
            possible_actions = self.domain.possibleActions(s=s)
            if self.domain.isTerminal(s) or len(possible_actions) == 0:
                continue

            # Apply Bellman Backup
            self.BellmanBackup(
                s, policy.pi(s, False, possible_actions),
                self.ns_samples, policy)

            # Update number of backups
            self.bellmanUpdates += 1

            # Check for the performance
            if self.bellmanUpdates % self.log_interval == 0:
                performance_return = self.performanceRun()[0]
                self.logger.info(
                    '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                    (hhmmss(deltaT(self.start_time)), self.bellmanUpdates,
                     performance_return))

        # check for convergence: L_infinity norm of the difference between
        # the policy's weight vector and the representation's weight vector
        weight_vec_change = l_norm(
            policy.representation.weight_vec -
            self.representation.weight_vec, np.inf)
        converged = weight_vec_change < self.convergence_threshold

        # Log Status
        self.logger.info(
            'PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f' %
            (policy_evaluation_iteration, hhmmss(deltaT(self.start_time)),
             self.bellmanUpdates, weight_vec_change))

        # Show Plots
        if self.show:
            if s is not None:
                self.domain.showDomain(s=s, filename="policy")
            self.domain.showLearning(self.representation, filename="policy")
    return converged
def evaluate(self, total_steps, episode_number, visualize=0):
    """
    Evaluate the current agent within an experiment

    Runs ``self.checks_per_policy`` performance runs, appends their averages
    to ``self.result`` and logs a summary line. The global numpy random
    state is captured on entry and restored on exit, and ``self.start_time``
    is shifted so the time spent here is not counted as learning time.

    :param total_steps: (int)
                number of steps used in learning so far
    :param episode_number: (int)
                number of episodes used in learning so far
    :param visualize: (int)
                performance run ``j`` is visualized while ``visualize > j``
    """
    # TODO resolve this hack
    if className(self.agent) == 'PolicyEvaluation':
        # Policy Evaluation Case
        self.result = self.agent.STATS
        return

    saved_rng_state = np.random.get_state()
    elapsed = deltaT(self.start_time)

    sum_return = 0.
    sum_steps = 0.
    sum_term = 0.
    sum_discounted = 0.
    for check in xrange(self.checks_per_policy):
        run_return, run_steps, run_term, run_discounted = self.performanceRun(
            total_steps, visualize=visualize > check)
        sum_return += run_return
        sum_steps += run_steps
        sum_term += run_term
        sum_discounted += run_discounted

    # Turn the accumulated sums into per-run averages
    sum_return /= self.checks_per_policy
    sum_steps /= self.checks_per_policy
    sum_term /= self.checks_per_policy
    sum_discounted /= self.checks_per_policy

    results = self.result
    results["learning_steps"].append(total_steps)
    results["return"].append(sum_return)
    results["learning_time"].append(self.elapsed_time)
    results["num_features"].append(self.agent.representation.features_num)
    results["steps"].append(sum_steps)
    results["terminated"].append(sum_term)
    results["learning_episode"].append(episode_number)
    results["discounted_return"].append(sum_discounted)

    # reset start time such that performanceRuns don't count
    self.start_time = clock() - elapsed

    remaining = (
        hhmmss(elapsed * (self.max_steps - total_steps) / total_steps)
        if total_steps > 0 else "?")
    summary = self.performance_log_template.format(
        total_steps=total_steps,
        elapsed=hhmmss(elapsed),
        remaining=remaining,
        totreturn=sum_return,
        steps=sum_steps,
        num_feat=self.agent.representation.features_num)
    self.logger.info(summary)

    np.random.set_state(saved_rng_state)
def evaluate(self, total_steps, episode_number, visualize=0):
    """
    Evaluate the current agent within an experiment

    Performs ``self.checks_per_policy`` performance runs, stores the mean
    return, steps, termination flag and discounted return in
    ``self.result`` and writes one line to the logger. The numpy random
    state seen on entry is put back before returning.

    :param total_steps: (int)
                number of steps used in learning so far
    :param episode_number: (int)
                number of episodes used in learning so far
    :param visualize: (int)
                the run with index ``j`` is visualized when ``visualize > j``
    """
    # TODO resolve this hack
    if className(self.agent) == 'PolicyEvaluation':
        # Policy Evaluation Case
        self.result = self.agent.STATS
        return

    random_state = np.random.get_state()
    elapsedTime = deltaT(self.start_time)

    # Execute all performance runs first, then aggregate them in run order
    runs = [
        self.performanceRun(total_steps, visualize=visualize > j)
        for j in xrange(self.checks_per_policy)
    ]
    avg_return = 0.
    avg_steps = 0.
    avg_term = 0.
    avg_discounted = 0.
    for p_ret, p_step, p_term, p_dret in runs:
        avg_return += p_ret
        avg_steps += p_step
        avg_term += p_term
        avg_discounted += p_dret
    avg_return /= self.checks_per_policy
    avg_steps /= self.checks_per_policy
    avg_term /= self.checks_per_policy
    avg_discounted /= self.checks_per_policy

    recorded = (
        ("learning_steps", total_steps),
        ("return", avg_return),
        ("learning_time", self.elapsed_time),
        ("num_features", self.agent.representation.features_num),
        ("steps", avg_steps),
        ("terminated", avg_term),
        ("learning_episode", episode_number),
        ("discounted_return", avg_discounted),
    )
    for key, value in recorded:
        self.result[key].append(value)

    # reset start time such that performanceRuns don't count
    self.start_time = clock() - elapsedTime

    if total_steps > 0:
        time_left = elapsedTime * (self.max_steps - total_steps) / total_steps
        remaining = hhmmss(time_left)
    else:
        remaining = "?"

    self.logger.info(
        self.performance_log_template.format(
            total_steps=total_steps,
            elapsed=hhmmss(elapsedTime),
            remaining=remaining,
            totreturn=avg_return,
            steps=avg_steps,
            num_feat=self.agent.representation.features_num))

    np.random.set_state(random_state)
def solve(self):
    """Solve the domain MDP.

    Alternates trajectory based policy evaluation with policy improvement
    until the L2 change of the weight vector between two consecutive
    policies drops below ``self.convergence_threshold`` or
    ``self.hasTime()`` turns false. After every improvement step a
    performance run is executed and its statistics are appended to
    ``self.result``.
    """
    self.start_time = clock()  # Used to track the total time for solving
    self.bellmanUpdates = 0
    converged = False
    PI_iteration = 0

    # The policy is maintained as separate copy of the representation.
    # This way as the representation is updated the policy remains intact
    policy = eGreedy(
        deepcopy(self.representation),
        epsilon=0,
        forcedDeterministicAmongBestActions=True)

    while self.hasTime() and not converged:
        self.trajectoryBasedPolicyEvaluation(policy)

        # Policy Improvement (Updating the representation of the value
        # function will automatically improve the policy
        PI_iteration += 1

        # Theta can increase in size if the representation is expanded,
        # hence padding the weight vector with zeros
        paddedTheta = padZeros(
            policy.representation.weight_vec,
            len(self.representation.weight_vec))

        # Calculate the change in the weight_vec as L2-norm
        delta_weight_vec = np.linalg.norm(
            paddedTheta - self.representation.weight_vec)
        converged = delta_weight_vec < self.convergence_threshold

        # Update the underlying value function of the policy
        policy.representation = deepcopy(self.representation)

        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            'PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, Return=%0.3f, steps=%d, features=%d' %
            (PI_iteration, hhmmss(deltaT(self.start_time)),
             self.bellmanUpdates, delta_weight_vec, performance_return,
             performance_steps, self.representation.features_num))

        if self.show:
            # No state/action pair exists in this scope (they are local to
            # trajectoryBasedPolicyEvaluation), so only the learned
            # representation is visualized.
            self.domain.showLearning(self.representation)

        # store stats
        self.result["bellman_updates"].append(self.bellmanUpdates)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(
            performance_discounted_return)
        # NOTE: key spelling ("improvemnt") is kept as-is so existing
        # readers of the result dictionary keep working.
        self.result["policy_improvemnt_iteration"].append(PI_iteration)

    if converged:
        self.logger.info('Converged!')
    super(TrajectoryBasedPolicyIteration, self).solve()
def trajectoryBasedPolicyEvaluation(self, policy):
    '''
    Evaluate the current policy by simulating trajectories and update the
    value function along the visited states.

    Trajectories are generated until
    ``self.MIN_CONVERGED_TRAJECTORIES`` consecutive trajectories each had a
    maximum absolute Bellman error below ``self.convergence_threshold``,
    ``self.hasTime()`` turns false, or ``self.max_PE_iterations``
    trajectories have been simulated.

    :param policy: policy being evaluated; it is handed to
        ``sample_ns_na`` and ``Q_oneStepLookAhead``.
    '''
    PE_iteration = 0
    evaluation_is_accurate = False
    converged_trajectories = 0
    while (not evaluation_is_accurate and self.hasTime()
           and PE_iteration < self.max_PE_iterations):

        # Generate a new episode e-greedy with the current values
        max_Bellman_Error = 0
        step = 0

        s, a, terminal = self.sample_ns_na(policy, start_trajectory=True)

        while (not terminal and step < self.domain.episodeCap
               and self.hasTime()):
            new_Q = self.representation.Q_oneStepLookAhead(
                s, a, self.ns_samples, policy)
            phi_s = self.representation.phi(s, terminal)
            phi_s_a = self.representation.phi_sa(
                s, terminal, a, phi_s=phi_s)
            old_Q = np.dot(phi_s_a, self.representation.weight_vec)
            bellman_error = new_Q - old_Q

            # Update the value function using approximate bellman backup
            self.representation.weight_vec += (
                self.alpha * bellman_error * phi_s_a)
            self.bellmanUpdates += 1
            step += 1
            max_Bellman_Error = max(max_Bellman_Error, abs(bellman_error))

            # Discover features if the representation has the discover
            # method (None is the default when the attribute is missing).
            # The looked-up method itself is called, so the guard and the
            # call always refer to the same attribute.
            discover_func = getattr(self.representation, 'discover', None)
            if discover_func and callable(discover_func):
                discover_func(phi_s, bellman_error)

            s, a, terminal = self.sample_ns_na(policy, a)

        # check for convergence of policy evaluation
        PE_iteration += 1
        if max_Bellman_Error < self.convergence_threshold:
            converged_trajectories += 1
        else:
            converged_trajectories = 0
        evaluation_is_accurate = (
            converged_trajectories >= self.MIN_CONVERGED_TRAJECTORIES)
        self.logger.info(
            'PE #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Features=%d' %
            (PE_iteration, hhmmss(deltaT(self.start_time)),
             self.bellmanUpdates, max_Bellman_Error,
             self.representation.features_num))
def policyImprovement(self, policy):
    '''
    Given a policy improve it by taking the greedy action in each state
    based on the value function.

    Every non-terminal state with at least one possible action gets a
    Bellman backup per action; afterwards the action of ``policy`` is
    compared with the greedy action of the solver's representation to
    count policy changes. Finally the solver's weights are copied into the
    policy, a performance run is executed and its statistics are stored.

    Returns the new policy and the number of states whose action changed
    '''
    changed_states = 0
    state_id = 0
    while (state_id < self.representation.agg_states_num
           and self.hasTime()):
        s = self.representation.stateID2state(state_id)
        state_id += 1

        # Nothing to improve for terminal states or states without actions
        if self.domain.isTerminal(s) or not len(
                self.domain.possibleActions(s)):
            continue

        for a in self.domain.possibleActions(s):
            if not self.hasTime():
                break
            self.BellmanBackup(s, a, self.ns_samples, policy)

        current_action = policy.pi(
            s, False, self.domain.possibleActions(s=s))
        greedy_action = self.representation.bestAction(
            s, False, self.domain.possibleActions(s=s))
        if current_action != greedy_action:
            changed_states += 1

    # This will cause the policy to be copied over
    policy.representation.weight_vec = self.representation.weight_vec.copy()

    run_stats = self.performanceRun()
    (performance_return, performance_steps, performance_term,
     performance_discounted_return) = run_stats
    self.logger.info(
        'PI #%d [%s]: BellmanUpdates=%d, Policy Change=%d, Return=%0.4f, Steps=%d' %
        (self.policy_improvement_iteration,
         hhmmss(deltaT(self.start_time)), self.bellmanUpdates,
         changed_states, performance_return, performance_steps))

    # store stats
    results = self.result
    results["bellman_updates"].append(self.bellmanUpdates)
    results["return"].append(performance_return)
    results["planning_time"].append(deltaT(self.start_time))
    results["num_features"].append(self.representation.features_num)
    results["steps"].append(performance_steps)
    results["terminated"].append(performance_term)
    results["discounted_return"].append(performance_discounted_return)
    results["policy_improvement_iteration"].append(
        self.policy_improvement_iteration)
    return policy, changed_states
def policyImprovement(self, policy):
    '''
    Given a policy improve it by taking the greedy action in each state
    based on the value function.

    Sweeps the aggregated states while time remains, applies a Bellman
    backup for every possible action of each non-terminal state and counts
    the states where ``policy`` disagrees with the greedy action of the
    solver's representation. The solver's weight vector is then copied into
    the policy and one performance run is logged and recorded.

    Returns the new policy together with the number of policy changes
    '''
    policyChanges = 0
    idx = 0
    while idx < self.representation.agg_states_num and self.hasTime():
        s = self.representation.stateID2state(idx)
        improvable = (not self.domain.isTerminal(s)
                      and len(self.domain.possibleActions(s)))
        if improvable:
            for action in self.domain.possibleActions(s):
                if not self.hasTime():
                    break
                self.BellmanBackup(s, action, self.ns_samples, policy)
            if (policy.pi(s, False, self.domain.possibleActions(s=s))
                    != self.representation.bestAction(
                        s, False, self.domain.possibleActions(s=s))):
                policyChanges += 1
        idx += 1

    # This will cause the policy to be copied over
    policy.representation.weight_vec = self.representation.weight_vec.copy()

    (performance_return, performance_steps, performance_term,
     performance_discounted_return) = self.performanceRun()
    elapsed_label = hhmmss(deltaT(self.start_time))
    self.logger.info(
        'PI #%d [%s]: BellmanUpdates=%d, Policy Change=%d, Return=%0.4f, Steps=%d' %
        (self.policy_improvement_iteration, elapsed_label,
         self.bellmanUpdates, policyChanges, performance_return,
         performance_steps))

    # store stats
    stats = (
        ("bellman_updates", self.bellmanUpdates),
        ("return", performance_return),
        ("planning_time", deltaT(self.start_time)),
        ("num_features", self.representation.features_num),
        ("steps", performance_steps),
        ("terminated", performance_term),
        ("discounted_return", performance_discounted_return),
        ("policy_improvement_iteration", self.policy_improvement_iteration),
    )
    for key, value in stats:
        self.result[key].append(value)
    return policy, policyChanges
def solve(self):
    """Solve the domain MDP.

    Repeatedly simulates e-greedy trajectories from the domain's start
    state and applies an approximate Bellman backup at every visited
    state-action pair. Planning stops when
    ``self.MIN_CONVERGED_TRAJECTORIES`` consecutive trajectories each had a
    maximum absolute Bellman error below ``self.convergence_threshold`` or
    when ``self.hasTime()`` turns false. One performance run is recorded in
    ``self.result`` after every trajectory.
    """

    def select_action(state, is_terminal, actions):
        # e-greedy choice: greedy w.r.t. the current values unless the
        # random draw does not exceed epsilon
        if np.random.rand() > self.epsilon:
            return self.representation.bestAction(
                state, is_terminal, actions)
        return randSet(actions)

    # Used to show the total time took the process
    self.start_time = clock()
    n_backups = 0
    converged = False
    iteration = 0
    # Number of consecutive trajectories with a very small observed
    # Bellman error
    small_error_streak = 0

    while self.hasTime() and not converged:
        # Generate a new episode e-greedy with the current values
        worst_error = 0
        step = 0
        s, terminal, p_actions = self.domain.s0()
        a = select_action(s, terminal, p_actions)

        while (not terminal and step < self.domain.episodeCap
               and self.hasTime()):
            new_Q = self.representation.Q_oneStepLookAhead(
                s, a, self.ns_samples)
            phi_s = self.representation.phi(s, terminal)
            phi_s_a = self.representation.phi_sa(s, terminal, a, phi_s)
            old_Q = np.dot(phi_s_a, self.representation.weight_vec)
            bellman_error = new_Q - old_Q

            self.representation.weight_vec += (
                self.alpha * bellman_error * phi_s_a)
            n_backups += 1
            step += 1

            # Discover features if the representation has the discover
            # method (None is the default when the attribute is missing)
            discover_func = getattr(self.representation, 'discover', None)
            if discover_func and callable(discover_func):
                self.representation.discover(phi_s, bellman_error)

            worst_error = max(worst_error, abs(bellman_error))

            # Simulate new state and action on trajectory
            _, s, terminal, p_actions = self.domain.step(a)
            a = select_action(s, terminal, p_actions)

        # check for convergence
        iteration += 1
        if worst_error < self.convergence_threshold:
            small_error_streak += 1
        else:
            small_error_streak = 0

        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        converged = small_error_streak >= self.MIN_CONVERGED_TRAJECTORIES
        self.logger.info(
            'PI #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Return=%0.4f, Steps=%d, Features=%d' %
            (iteration, hhmmss(deltaT(self.start_time)), n_backups,
             worst_error, performance_return, performance_steps,
             self.representation.features_num))

        if self.show:
            self.domain.show(a, representation=self.representation, s=s)

        # store stats
        self.result["bellman_updates"].append(n_backups)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(
            performance_discounted_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info('Converged!')
    super(TrajectoryBasedValueIteration, self).solve()
def solve(self):
    """Solve the domain MDP.

    Sweeps Bellman backups over every state-action pair of a tabular
    representation until the L-infinity change of the weight vector over
    one full sweep is below ``self.convergence_threshold`` or
    ``self.hasTime()`` turns false. Logs an error and returns 0 when the
    representation is not tabular. One performance run is recorded in
    ``self.result`` per sweep.
    """
    self.start_time = clock()  # Used to show the total time took the process

    # Check for Tabular Representation
    if not self.IsTabularRepresentation():
        self.logger.error(
            "Value Iteration works only with a tabular representation.")
        return 0

    backups = 0  # used to track the performance improvement.
    sweep = 0
    converged = False
    state_count = self.representation.agg_states_num

    while self.hasTime() and not converged:
        sweep += 1

        # Store the weight vector for comparison
        weights_before = self.representation.weight_vec.copy()

        # Sweep The State Space
        for state_id in xrange(state_count):
            s = self.representation.stateID2state(state_id)

            # Sweep through possible actions
            for a in self.domain.possibleActions(s):
                # Check for available planning time
                if not self.hasTime():
                    break

                self.BellmanBackup(s, a, ns_samples=self.ns_samples)
                backups += 1

                # Create Log
                if backups % self.log_interval == 0:
                    performance_return, _, _, _ = self.performanceRun()
                    self.logger.info(
                        '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                        (hhmmss(deltaT(self.start_time)), backups,
                         performance_return))

        # check for convergence
        weight_vec_change = l_norm(
            weights_before - self.representation.weight_vec, np.inf)
        converged = weight_vec_change < self.convergence_threshold

        # log the stats
        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            'PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, Return=%0.4f, Steps=%d' %
            (sweep, hhmmss(deltaT(self.start_time)), backups,
             weight_vec_change, performance_return, performance_steps))

        # Show the domain and value function
        if self.show:
            self.domain.show(a, s=s, representation=self.representation)

        # store stats
        sweep_stats = (
            ("bellman_updates", backups),
            ("return", performance_return),
            ("planning_time", deltaT(self.start_time)),
            ("num_features", self.representation.features_num),
            ("steps", performance_steps),
            ("terminated", performance_term),
            ("discounted_return", performance_discounted_return),
            ("iteration", sweep),
        )
        for key, value in sweep_stats:
            self.result[key].append(value)

    if converged:
        self.logger.info('Converged!')
    super(ValueIteration, self).solve()
def solve(self):
    """Solve the domain MDP.

    Simulates e-greedy trajectories starting at ``self.domain.s0()`` and
    performs an approximate Bellman backup for each visited state-action
    pair. The loop ends once ``self.MIN_CONVERGED_TRAJECTORIES``
    trajectories in a row stayed below ``self.convergence_threshold`` in
    maximum absolute Bellman error, or when the solver is out of time.
    Statistics of one performance run per trajectory go to ``self.result``.
    """
    # Used to show the total time took the process
    self.start_time = clock()
    bellmanUpdates = 0
    converged = False
    iteration = 0
    # Track the number of consequent trajectories with very small observed
    # BellmanError
    converged_trajectories = 0

    while self.hasTime() and not converged:
        # Generate a new episode e-greedy with the current values
        max_Bellman_Error = 0
        step = 0
        s, terminal, p_actions = self.domain.s0()
        if np.random.rand() > self.epsilon:
            a = self.representation.bestAction(s, terminal, p_actions)
        else:
            a = randSet(p_actions)

        while (not terminal and step < self.domain.episodeCap
               and self.hasTime()):
            target_q = self.representation.Q_oneStepLookAhead(
                s, a, self.ns_samples)
            phi_s = self.representation.phi(s, terminal)
            phi_s_a = self.representation.phi_sa(s, terminal, a, phi_s)
            current_q = np.dot(phi_s_a, self.representation.weight_vec)
            bellman_error = target_q - current_q

            self.representation.weight_vec += (
                self.alpha * bellman_error * phi_s_a)
            bellmanUpdates += 1
            step += 1

            # Discover features if the representation has the discover
            # method; getattr falls back to None when it is missing
            discover_func = getattr(self.representation, 'discover', None)
            if discover_func and callable(discover_func):
                self.representation.discover(phi_s, bellman_error)

            max_Bellman_Error = max(max_Bellman_Error, abs(bellman_error))

            # Simulate new state and action on trajectory
            _, s, terminal, p_actions = self.domain.step(a)
            if np.random.rand() > self.epsilon:
                a = self.representation.bestAction(s, terminal, p_actions)
            else:
                a = randSet(p_actions)

        # check for convergence
        iteration += 1
        below_threshold = max_Bellman_Error < self.convergence_threshold
        converged_trajectories = (
            converged_trajectories + 1 if below_threshold else 0)

        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        converged = (
            converged_trajectories >= self.MIN_CONVERGED_TRAJECTORIES)
        self.logger.info(
            'PI #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Return=%0.4f, Steps=%d, Features=%d' %
            (iteration, hhmmss(deltaT(self.start_time)), bellmanUpdates,
             max_Bellman_Error, performance_return, performance_steps,
             self.representation.features_num))

        if self.show:
            self.domain.show(a, representation=self.representation, s=s)

        # store stats
        trajectory_stats = (
            ("bellman_updates", bellmanUpdates),
            ("return", performance_return),
            ("planning_time", deltaT(self.start_time)),
            ("num_features", self.representation.features_num),
            ("steps", performance_steps),
            ("terminated", performance_term),
            ("discounted_return", performance_discounted_return),
            ("iteration", iteration),
        )
        for key, value in trajectory_stats:
            self.result[key].append(value)

    if converged:
        self.logger.info('Converged!')
    super(TrajectoryBasedValueIteration, self).solve()
def run(self, performance_domain=None, visualize_performance=0,
        visualize_learning=False, visualize_steps=False,
        debug_on_sigurg=False):
    """
    Run the experiment and collect statistics / generate the results

    :param performance_domain:
        domain to use for performance runs; a deep copy of it is stored in
        ``self.performance_domain``. When None, a deep copy of
        ``self.domain`` is used instead.
    :param visualize_performance: (int)
        determines whether a visualization of the steps taken in
        performance runs are shown. 0 means no visualization is shown.
        A value n > 0 means that only the first n performance runs for a
        specific policy are shown (i.e., for n < checks_per_policy, not all
        performance runs are shown)
    :param visualize_learning: (boolean)
        show some visualization of the learning status before each
        performance evaluation (e.g. Value function)
    :param visualize_steps: (boolean)
        visualize all steps taken during learning
    :param debug_on_sigurg: (boolean)
        if true, the ipdb debugger is opened when the python process
        receives a SIGURG signal. This allows to enter a debugger at any
        time, e.g. to view data interactively or actual debugging.
        The feature works only in Unix systems. The signal can be sent
        with the kill command:

            kill -URG pid

        where pid is the process id of the python interpreter running this
        function.
    """
    if debug_on_sigurg:
        rlpy.Tools.ipshell.ipdb_on_SIGURG()
    if performance_domain is None:
        self.performance_domain = deepcopy(self.domain)
    else:
        self.performance_domain = deepcopy(performance_domain)
    self.seed_components()

    self.result = defaultdict(list)
    self.result["seed"] = self.exp_id
    total_steps = 0
    eps_steps = 0
    eps_return = 0
    episode_number = 0

    # show policy or value function of initial policy
    if visualize_learning:
        self.domain.showLearning(self.agent.representation)

    # Used to bound the number of logs in the file
    start_log_time = clock()
    # Used to show the total time took the process
    self.start_time = clock()
    self.elapsed_time = 0
    # do a first evaluation to get the quality of the inital policy
    self.all_experiment_list = []
    self.evaluate(total_steps, episode_number, visualize_performance)
    self.total_eval_time = 0.
    # Start in the "terminal" condition so the first loop iteration
    # begins a fresh episode
    terminal = True

    while total_steps < self.max_steps:
        if terminal or eps_steps >= self.domain.episodeCap:
            s, terminal, p_actions = self.domain.s0()
            a = self.agent.policy.pi(s, terminal, p_actions)
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)

            eps_return = 0
            eps_steps = 0
            episode_number += 1

        # Act,Step
        r, ns, terminal, np_actions = self.domain.step(a)

        self._gather_transition_statistics(s, a, ns, r, learning=True)
        na = self.agent.policy.pi(ns, terminal, np_actions)

        total_steps += 1
        eps_steps += 1
        eps_return += r

        # Print Current performance, at most once per log interval
        if ((terminal or eps_steps == self.domain.episodeCap)
                and deltaT(start_log_time) > self.log_interval):
            start_log_time = clock()
            elapsedTime = deltaT(self.start_time)
            self.logger.info(
                self.log_template.format(
                    total_steps=total_steps,
                    elapsed=hhmmss(elapsedTime),
                    remaining=hhmmss(
                        elapsedTime * (self.max_steps - total_steps) /
                        total_steps),
                    totreturn=eps_return,
                    steps=eps_steps,
                    num_feat=self.agent.representation.features_num))

        # learning
        self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
        s, a, p_actions = ns, na, np_actions
        # Visual
        if visualize_steps:
            self.domain.show(a, self.agent.representation)

        # Check Performance. Floor division keeps the interval integral
        # under true division as well, and the lower bound of 1 avoids a
        # modulo by zero when there are more policy checks than steps.
        check_interval = max(1, self.max_steps // self.num_policy_checks)
        if total_steps % check_interval == 0:
            self.elapsed_time = deltaT(
                self.start_time) - self.total_eval_time

            # show policy or value function
            if visualize_learning:
                self.domain.showLearning(self.agent.representation)

            self.evaluate(
                total_steps, episode_number, visualize_performance)
            self.total_eval_time += (
                deltaT(self.start_time) - self.elapsed_time -
                self.total_eval_time)
            start_log_time = clock()

    # Visual
    if visualize_steps:
        self.domain.show(a, self.agent.representation)
    self.logger.info(
        "Total Experiment Duration %s" % (hhmmss(deltaT(self.start_time))))
def evaluate(self, total_steps, episode_number, visualize=0):
    """
    Evaluate the current agent within an experiment

    Executes ``self.checks_per_policy`` performance runs, prints the return
    of each run, and appends the mean return, its standard deviation and
    the mean steps / termination / discounted return to ``self.result``.
    The numpy random state captured on entry is restored before returning
    and ``self.start_time`` is shifted so evaluation time is not counted.

    :param total_steps: (int)
                number of steps used in learning so far
    :param episode_number: (int)
                number of episodes used in learning so far
    :param visualize: (int)
                performance run ``j`` is visualized while ``visualize > j``
    """
    # Parenthesised single-argument print emits the same text on Python 2
    # and Python 3.
    print("Stepsize: %f" % self.agent.learn_rate)
    # NOTE: this changes numpy's print options for the whole process.
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    random_state = np.random.get_state()
    elapsedTime = deltaT(self.start_time)

    performance_return = 0.
    performance_steps = 0.
    performance_term = 0.
    performance_discounted_return = 0.
    performance_return_squared = 0.
    for j in range(self.checks_per_policy):
        p_ret, p_step, p_term, p_dret = self.performanceRun(
            total_steps, visualize=visualize > j)
        print("%s %s" % (j, p_ret))
        performance_return += p_ret
        performance_steps += p_step
        performance_term += p_term
        performance_discounted_return += p_dret
        performance_return_squared += p_ret ** 2

    performance_return /= self.checks_per_policy
    performance_return_squared /= self.checks_per_policy
    performance_steps /= self.checks_per_policy
    performance_term /= self.checks_per_policy
    performance_discounted_return /= self.checks_per_policy

    # Var[X] = E[X^2] - E[X]^2; round-off can push this slightly below
    # zero for (near-)identical returns, which would make sqrt return NaN.
    return_variance = performance_return_squared - performance_return ** 2
    std_return = np.sqrt(max(return_variance, 0.))

    self.result["learning_steps"].append(total_steps)
    self.result["return"].append(performance_return)
    self.result["return_std"].append(std_return)
    self.result["learning_time"].append(self.elapsed_time)
    self.result["num_features"].append(
        self.agent.representation.features_num)
    self.result["steps"].append(performance_steps)
    self.result["terminated"].append(performance_term)
    self.result["learning_episode"].append(episode_number)
    self.result["discounted_return"].append(performance_discounted_return)

    # reset start time such that performanceRuns don't count
    self.start_time = clock() - elapsedTime

    self.logger.info(
        self.performance_log_template.format(
            episode=episode_number,
            total_steps=total_steps,
            totreturn=performance_return,
            stdreturn=std_return,  # TODO
            steps=performance_steps,
            num_feat=self.agent.representation.features_num))

    np.random.set_state(random_state)
def solveInMatrixFormat(self):
    """
    Solve the domain MDP using a matrix-form policy evaluation step.

    Until the L-infinity change of the weight vector is below
    ``self.convergence_threshold`` or ``self.hasTime()`` turns false:

    1. Gather data following an e-greedy policy
    2. Calculate A and b estimates
    3. Calculate new_weight_vec, and delta_weight_vec

    The resulting weights are written to ``self.representation.weight_vec``
    so the e-greedy policy stored in ``self.policy`` follows the last
    weight vector. One performance run per iteration is recorded in
    ``self.result``.
    """
    self.policy = eGreedy(self.representation, epsilon=self.epsilon)

    # Number of samples to be used for each policy evaluation phase. L1 in
    # the Geramifard et. al. FTML 2012 paper
    self.samples_num = 1000

    self.start_time = clock()  # Used to track the total time for solving
    samples = 0
    converged = False
    iteration = 0
    while self.hasTime() and not converged:

        # 1. Gather samples following an e-greedy policy
        # (NS and T are returned by collectSamples but not used below)
        S, Actions, NS, R, T = self.policy.collectSamples(self.samples_num)
        samples += self.samples_num

        # 2. Calculate A and b estimates
        a_num = self.domain.actions_num
        n = self.representation.features_num
        discount_factor = self.domain.discount_factor

        self.A = np.zeros((n * a_num, n * a_num))
        self.b = np.zeros((n * a_num, 1))
        for i in range(self.samples_num):
            # NOTE(review): other call sites in this file pass a terminal
            # flag to phi_sa (phi_sa(s, terminal, a, ...)); this
            # two-argument call may be stale - confirm against the
            # representation API.
            phi_s_a = self.representation.phi_sa(
                S[i], Actions[i, 0]).reshape((-1, 1))
            E_phi_ns_na = self.calculate_expected_phi_ns_na(
                S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
            d = phi_s_a - discount_factor * E_phi_ns_na
            self.A += np.outer(phi_s_a, d.T)
            self.b += phi_s_a * R[i, 0]

        # 3. calculate new_weight_vec, and delta_weight_vec
        new_weight_vec, solve_time = solveLinear(
            regularize(self.A), self.b)
        iteration += 1
        if solve_time > 1:
            self.logger.info(
                '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)' %
                (iteration, solve_time))
        delta_weight_vec = l_norm(
            new_weight_vec - self.representation.weight_vec, np.inf)
        converged = delta_weight_vec < self.convergence_threshold
        self.representation.weight_vec = new_weight_vec

        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f' %
            (iteration, hhmmss(deltaT(self.start_time)), samples,
             delta_weight_vec, performance_return))

        if self.show:
            # Same argument convention as the other domain.show calls:
            # action first, then the representation, state as keyword.
            self.domain.show(Actions[-1, 0], self.representation, s=S[-1])

        # store stats
        self.result["samples"].append(samples)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(
            performance_discounted_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info('Converged!')

    super(TrajectoryBasedPolicyIteration, self).solve()
def policyEvaluation(self, policy):
    '''
    Evaluate a given policy: this is done by applying the Bellman backup
    over all states until the change is less than a given threshold.

    A sweep backs up, for every non-terminal state that has possible
    actions, the action returned by ``policy.pi``. Sweeps repeat until the
    L-infinity distance between ``policy.representation.weight_vec`` and
    ``self.representation.weight_vec`` is below
    ``self.convergence_threshold``, the solver is out of time, or
    ``self.max_PE_iterations`` sweeps were made.

    :param policy: the policy to evaluate.

    Returns: convergence status as a boolean
    '''
    converged = False
    policy_evaluation_iteration = 0
    # Last swept state and its actions; both stay None when no state has
    # been visited, so the plotting step can be skipped instead of
    # failing on unbound names.
    s = None
    possible_actions = None
    while (not converged and self.hasTime()
           and policy_evaluation_iteration < self.max_PE_iterations):
        policy_evaluation_iteration += 1

        # Sweep The State Space
        for i in range(0, self.representation.agg_states_num):

            # Check for solver time
            if not self.hasTime():
                break

            # Map an state ID to state
            s = self.representation.stateID2state(i)

            # Skip terminal states and states with no possible action
            possible_actions = self.domain.possibleActions(s=s)
            if self.domain.isTerminal(s) or len(possible_actions) == 0:
                continue

            # Apply Bellman Backup
            self.BellmanBackup(
                s, policy.pi(s, False, possible_actions),
                self.ns_samples, policy)

            # Update number of backups
            self.bellmanUpdates += 1

            # Check for the performance
            if self.bellmanUpdates % self.log_interval == 0:
                performance_return = self.performanceRun()[0]
                self.logger.info(
                    '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                    (hhmmss(deltaT(self.start_time)), self.bellmanUpdates,
                     performance_return))

        # check for convergence: L_infinity norm of the difference between
        # the policy's weight vector and the representation's weight vector
        weight_vec_change = l_norm(
            policy.representation.weight_vec -
            self.representation.weight_vec, np.inf)
        converged = weight_vec_change < self.convergence_threshold

        # Log Status
        self.logger.info(
            'PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f' %
            (policy_evaluation_iteration, hhmmss(deltaT(self.start_time)),
             self.bellmanUpdates, weight_vec_change))

        # Show Plots (only when the sweep visited at least one state)
        if self.show and s is not None:
            self.domain.show(
                policy.pi(s, False, possible_actions),
                self.representation,
                s=s)
    return converged
def trajectoryBasedPolicyEvaluation(self, policy):
    '''
    Evaluate ``policy`` along simulated trajectories.

    Each trajectory is sampled through ``self.sample_ns_na`` and, at every
    visited state-action pair, the weight vector is moved along the feature
    vector by ``self.alpha`` times the Bellman error (one-step look-ahead Q
    minus current Q). A trajectory counts as converged when its largest
    absolute Bellman error is below ``self.convergence_threshold``; the
    evaluation is considered accurate after
    ``self.MIN_CONVERGED_TRAJECTORIES`` converged trajectories in a row.
    The loop also ends when ``self.hasTime()`` is false or after
    ``self.max_PE_iterations`` trajectories.

    :param policy: policy used to sample trajectories and passed to the
        one-step look-ahead.
    '''
    trajectories_done = 0
    consecutive_converged = 0
    evaluation_is_accurate = False

    while (not evaluation_is_accurate
           and self.hasTime()
           and trajectories_done < self.max_PE_iterations):
        # Start a fresh trajectory.
        worst_error = 0
        step = 0
        s, a, terminal = self.sample_ns_na(policy, start_trajectory=True)

        while (not terminal
               and step < self.domain.episodeCap
               and self.hasTime()):
            target_q = self.representation.Q_oneStepLookAhead(
                s, a, self.ns_samples, policy)
            phi_s = self.representation.phi(s, terminal)
            phi_s_a = self.representation.phi_sa(
                s, terminal, a, phi_s=phi_s)
            current_q = np.dot(phi_s_a, self.representation.weight_vec)
            bellman_error = target_q - current_q

            # Approximate Bellman backup on the weight vector.
            self.representation.weight_vec += (
                self.alpha * bellman_error * phi_s_a)
            self.bellmanUpdates += 1
            step += 1

            error_size = abs(bellman_error)
            if error_size > worst_error:
                worst_error = error_size

            # Feature discovery, only for representations exposing a
            # callable `discover` attribute.
            # NOTE(review): the guard looks up `discover` but the call goes
            # to `post_discover` -- confirm this pairing is intended.
            discover_func = getattr(self.representation, 'discover', None)
            if discover_func and callable(discover_func):
                self.representation.post_discover(phi_s, bellman_error)

            s, a, terminal = self.sample_ns_na(policy, a)

        # Book-keeping for the convergence streak.
        trajectories_done += 1
        if worst_error < self.convergence_threshold:
            consecutive_converged += 1
        else:
            consecutive_converged = 0
        evaluation_is_accurate = (
            consecutive_converged >= self.MIN_CONVERGED_TRAJECTORIES)

        elapsed = hhmmss(deltaT(self.start_time))
        self.logger.info(
            'PE #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Features=%d' %
            (trajectories_done, elapsed, self.bellmanUpdates, worst_error,
             self.representation.features_num))
def solve(self):
    """Solve the domain MDP.

    Alternates trajectory-based policy evaluation with policy improvement
    until the L2 norm of the weight-vector change between two consecutive
    policies drops below ``self.convergence_threshold`` or
    ``self.hasTime()`` is false. After every iteration a performance run is
    executed and its statistics are appended to ``self.result``. Finally the
    parent class' ``solve`` is invoked.
    """
    self.start_time = clock()  # Used to track the total time for solving
    self.bellmanUpdates = 0
    converged = False
    PI_iteration = 0

    # The policy is maintained as separate copy of the representation.
    # This way as the representation is updated the policy remains intact
    policy = eGreedy(
        deepcopy(self.representation),
        epsilon=0,
        forcedDeterministicAmongBestActions=True)

    while self.hasTime() and not converged:
        self.trajectoryBasedPolicyEvaluation(policy)

        # Policy Improvement (Updating the representation of the value
        # function will automatically improve the policy
        PI_iteration += 1

        # Theta can increase in size if the representation is expanded,
        # hence padding the policy's weight vector with zeros
        paddedTheta = padZeros(
            policy.representation.weight_vec,
            len(self.representation.weight_vec))

        # Calculate the change in the weight_vec as L2-norm
        delta_weight_vec = np.linalg.norm(
            paddedTheta - self.representation.weight_vec)
        converged = delta_weight_vec < self.convergence_threshold

        # Update the underlying value function of the policy
        policy.representation = deepcopy(self.representation)

        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            'PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, Return=%0.3f, steps=%d, features=%d' %
            (PI_iteration, hhmmss(deltaT(self.start_time)),
             self.bellmanUpdates, delta_weight_vec, performance_return,
             performance_steps, self.representation.features_num))

        if self.show:
            # This branch used to reference `s` and `a`, which are never
            # assigned in this method (NameError whenever show is on).
            # Display a start state together with the action the freshly
            # improved policy picks there.
            # NOTE(review): s0() resets the domain; this is assumed to be
            # harmless because the next evaluation starts a new trajectory
            # -- confirm.
            s, terminal, p_actions = self.domain.s0()
            a = policy.pi(s, terminal, p_actions)
            self.domain.show(a, representation=self.representation, s=s)

        # store stats
        self.result["bellman_updates"].append(self.bellmanUpdates)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(performance_discounted_return)
        # The key's spelling is kept as-is: it is read back at runtime.
        self.result["policy_improvemnt_iteration"].append(PI_iteration)

    if converged:
        self.logger.info('Converged!')
    super(TrajectoryBasedPolicyIteration, self).solve()
def solveInMatrixFormat(self):
    '''
    Policy iteration whose evaluation step solves a linear system built
    from sampled transitions.

    While the weight change is above ``self.convergence_threshold`` and
    ``self.hasTime()`` is true:

    1. Gather data following an e-greedy policy
    2. Calculate A and b estimates
    3. Calculate new_weight_vec, and delta_weight_vec (L-infinity norm)

    Nothing is returned; the last solution is left in
    ``self.representation.weight_vec`` and per-iteration statistics are
    appended to ``self.result``. The parent class' ``solve`` is invoked at
    the end.
    '''
    self.policy = eGreedy(self.representation, epsilon=self.epsilon)

    # Number of samples to be used for each policy evaluation phase. L1 in
    # the Geramifard et. al. FTML 2012 paper
    self.samples_num = 1000

    self.start_time = clock()  # Used to track the total time for solving
    samples = 0
    converged = False
    iteration = 0
    while self.hasTime() and not converged:

        # 1. Gather samples following an e-greedy policy
        S, Actions, NS, R, T = self.collectSamples(self.samples_num)
        samples += self.samples_num

        # 2. Calculate A and b estimates
        a_num = self.domain.actions_num
        n = self.representation.features_num
        discount_factor = self.domain.discount_factor

        self.A = np.zeros((n * a_num, n * a_num))
        self.b = np.zeros((n * a_num, 1))
        # `range` instead of the Python-2-only `xrange`.
        for i in range(self.samples_num):
            phi_s_a = self.representation.phi_sa(
                S[i], T[i], Actions[i, 0]).reshape((-1, 1))
            E_phi_ns_na = self.calculate_expected_phi_ns_na(
                S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
            d = phi_s_a - discount_factor * E_phi_ns_na
            self.A += np.outer(phi_s_a, d.T)
            self.b += phi_s_a * R[i, 0]

        # 3. calculate new_weight_vec, and delta_weight_vec
        new_weight_vec, solve_time = solveLinear(regularize(self.A), self.b)
        iteration += 1
        # Only report the linear solve when it was slow.
        if solve_time > 1:
            self.logger.info(
                '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)' %
                (iteration, solve_time))
        delta_weight_vec = l_norm(
            new_weight_vec - self.representation.weight_vec, np.inf)
        converged = delta_weight_vec < self.convergence_threshold
        self.representation.weight_vec = new_weight_vec
        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f' %
            (iteration, hhmmss(deltaT(self.start_time)), samples,
             delta_weight_vec, performance_return))
        if self.show:
            # Argument order follows the other `domain.show` call sites:
            # action, representation, then the state as keyword `s`. The
            # previous call passed the state as the action and the
            # representation as the state.
            # NOTE(review): confirm `show` accepts the `s` keyword here.
            self.domain.show(Actions[-1, 0], self.representation, s=S[-1])

        # store stats
        self.result["samples"].append(samples)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(performance_discounted_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info('Converged!')

    super(TrajectoryBasedPolicyIteration, self).solve()
def run(self, visualize_performance=0, visualize_learning=False,
        visualize_steps=False, debug_on_sigurg=False):
    """
    Run the experiment and collect statistics / generate the results

    Learning proceeds for ``self.max_steps`` steps. Roughly every
    ``max_steps // num_policy_checks`` steps the current policy is
    evaluated through ``self.evaluate``; time spent evaluating is tracked
    in ``self.total_eval_time`` and excluded from ``self.elapsed_time``.

    :param visualize_performance: (int)
        determines whether a visualization of the steps taken in
        performance runs are shown. 0 means no visualization is shown.
        A value n > 0 means that only the first n performance runs for a
        specific policy are shown (i.e., for n < checks_per_policy, not all
        performance runs are shown)
    :param visualize_learning: (boolean)
        show some visualization of the learning status before each
        performance evaluation (e.g. Value function)
    :param visualize_steps: (boolean)
        visualize all steps taken during learning
    :param debug_on_sigurg: (boolean)
        if true, the ipdb debugger is opened when the python process
        receives a SIGURG signal. This allows to enter a debugger at any
        time, e.g. to view data interactively or actual debugging.
        The feature works only in Unix systems. The signal can be sent
        with the kill command:

            kill -URG pid

        where pid is the process id of the python interpreter running this
        function.
    """
    if debug_on_sigurg:
        rlpy.Tools.ipshell.ipdb_on_SIGURG()
    self.performance_domain = deepcopy(self.domain)
    self.seed_components()

    self.result = defaultdict(list)
    self.result["seed"] = self.exp_id
    total_steps = 0
    eps_steps = 0
    eps_return = 0
    episode_number = 0

    # show policy or value function of initial policy
    if visualize_learning:
        self.domain.showLearning(self.agent.representation)

    # Used to bound the number of logs in the file
    start_log_time = clock()
    # Used to show the total time took the process
    self.start_time = clock()
    self.elapsed_time = 0
    # do a first evaluation to get the quality of the initial policy
    self.evaluate(total_steps, episode_number, visualize_performance)
    self.total_eval_time = 0.

    # Number of learning steps between two policy checks. Floor division
    # keeps the interval integral under true division (where a fractional
    # interval makes `total_steps % interval == 0` unreliable); the clamp
    # avoids a zero interval when there are more checks than steps.
    check_interval = max(1, self.max_steps // self.num_policy_checks)

    # Forces the first loop iteration to start an episode.
    terminal = True
    while total_steps < self.max_steps:
        if terminal or eps_steps >= self.domain.episodeCap:
            # Start a new episode.
            s, terminal, p_actions = self.domain.s0()
            a = self.agent.policy.pi(s, terminal, p_actions)
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)

            eps_return = 0
            eps_steps = 0
            episode_number += 1

        # Act,Step
        r, ns, terminal, np_actions = self.domain.step(a)

        self._gather_transition_statistics(s, a, ns, r, learning=True)
        na = self.agent.policy.pi(ns, terminal, np_actions)

        total_steps += 1
        eps_steps += 1
        eps_return += r

        # Print current performance at the end of an episode, but at most
        # once per `log_interval` of elapsed time.
        if ((terminal or eps_steps == self.domain.episodeCap)
                and deltaT(start_log_time) > self.log_interval):
            start_log_time = clock()
            elapsedTime = deltaT(self.start_time)
            self.logger.info(
                self.log_template.format(
                    total_steps=total_steps,
                    elapsed=hhmmss(elapsedTime),
                    remaining=hhmmss(
                        elapsedTime * (self.max_steps - total_steps)
                        / total_steps),
                    totreturn=eps_return,
                    steps=eps_steps,
                    num_feat=self.agent.representation.features_num))

        # learning
        self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
        s, a, p_actions = ns, na, np_actions
        # Visual
        if visualize_steps:
            self.domain.show(a, self.agent.representation)

        # Check Performance
        if total_steps % check_interval == 0:
            self.elapsed_time = deltaT(
                self.start_time) - self.total_eval_time

            # show policy or value function
            if visualize_learning:
                self.domain.showLearning(self.agent.representation)

            self.evaluate(
                total_steps,
                episode_number,
                visualize_performance)
            self.total_eval_time += deltaT(self.start_time) - \
                self.elapsed_time - \
                self.total_eval_time
            start_log_time = clock()

    # Visual
    if visualize_steps:
        self.domain.show(a, self.agent.representation)
    self.logger.info("Total Experiment Duration %s" %
                     (hhmmss(deltaT(self.start_time))))
def run(self, visualize_performance=0, visualize_learning=False,
        visualize_steps=False, debug_on_sigurg=False):
    """
    Run the experiment and collect statistics / generate the results

    Learning proceeds while ``episode_number < self.max_episode``. At the
    end of every episode, per-episode statistics are appended to
    ``self.trials``; every ``max_episode // num_policy_checks`` episodes
    the current policy is evaluated through ``self.evaluate``.

    :param visualize_performance: (int)
        determines whether a visualization of the steps taken in
        performance runs are shown. 0 means no visualization is shown.
        A value n > 0 means that only the first n performance runs for a
        specific policy are shown (i.e., for n < checks_per_policy, not all
        performance runs are shown)
    :param visualize_learning: (boolean)
        show some visualization of the learning status before each
        performance evaluation (e.g. Value function)
    :param visualize_steps: (boolean)
        visualize all steps taken during learning
    :param debug_on_sigurg: (boolean)
        if true, the ipdb debugger is opened when the python process
        receives a SIGURG signal. This allows to enter a debugger at any
        time, e.g. to view data interactively or actual debugging.
        The feature works only in Unix systems. The signal can be sent
        with the kill command:

            kill -URG pid

        where pid is the process id of the python interpreter running this
        function.
    """
    if debug_on_sigurg:
        rlpy.Tools.ipshell.ipdb_on_SIGURG()
    self.performance_domain = deepcopy(self.domain)
    self.seed_components()

    self.result = defaultdict(list)
    self.result["seed"] = self.exp_id
    self.trials = defaultdict(list)
    self.trials["seed"] = self.exp_id
    total_steps = 0
    eps_steps = 0
    eps_return = 0
    episode_number = 0

    # show policy or value function of initial policy
    if visualize_learning:
        self.domain.showLearning(self.agent.representation)

    # Used to bound the number of logs in the file
    start_log_time = clock()
    # Used to show the total time took the process
    self.start_time = clock()
    self.elapsed_time = 0
    # do a first evaluation to get the quality of the initial policy
    self.evaluate(total_steps, episode_number, visualize_performance)
    self.total_eval_time = 0.

    # Number of episodes between two policy checks. Floor division keeps
    # the interval integral under true division; the clamp avoids a zero
    # interval when there are more checks than episodes.
    check_interval = max(1, self.max_episode // self.num_policy_checks)

    # Forces the first loop iteration to start an episode.
    terminal = True
    while episode_number < self.max_episode:
        if terminal or eps_steps >= self.domain.episodeCap:
            # Start a new episode: reset the per-episode action counter
            # and the policy's trajectory record.
            counter = defaultdict(int)
            self.agent.policy._trajectory = defaultdict(list)
            s, terminal, p_actions = self.domain.s0()
            a = self.agent.policy.pi(s, terminal, p_actions)
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)

            eps_return = 0
            eps_steps = 0
            episode_number += 1

        # Act,Step
        r, ns, terminal, np_actions = self.domain.step(a)
        counter[a] += 1

        # The state used to be passed as the logging *format string*, which
        # makes the handler fail while formatting; pass it as an argument.
        self.logger.debug(
            "%s %s", s, self.agent.representation.Qs(s, False))

        na = self.agent.policy.pi(ns, terminal, np_actions)

        total_steps += 1
        eps_steps += 1
        eps_return += r

        # End of episode: record per-episode statistics.
        # NOTE(review): the nesting of everything in this branch was
        # reconstructed from a whitespace-collapsed source -- confirm that
        # `update_best_representation` and the performance check belong
        # here, and that the former is not guarded by `best_return`.
        if terminal or eps_steps == self.domain.episodeCap:
            self.trials["learning_steps"].append(total_steps)
            self.trials["eps_steps"].append(eps_steps)
            self.trials["return"].append(eps_return)
            self.trials["num_feat"].append(
                self.agent.representation.features_num)
            self.trials["learning_episode"].append(episode_number)
            # Fraction of this episode's steps that used action 0 / 1.
            self.trials["Action_0"].append(float(counter[0]) / eps_steps)
            self.trials["Action_1"].append(float(counter[1]) / eps_steps)
            if self.best_return is not None:
                self.trials["best_return"].append(self.best_return)
            self.update_best_representation(
                total_steps, episode_number, visualize_performance)

            # Check Performance
            if episode_number % check_interval == 0:
                self.elapsed_time = deltaT(
                    self.start_time) - self.total_eval_time

                # show policy or value function
                if visualize_learning:
                    self.domain.showLearning(self.agent.representation)

                self.evaluate(
                    total_steps, episode_number, visualize_performance)
                self.total_eval_time += deltaT(self.start_time) - \
                    self.elapsed_time - \
                    self.total_eval_time
                start_log_time = clock()

        # learning
        self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
        s, a, p_actions = ns, na, np_actions
        # Visual
        if visualize_steps:
            self.domain.show(a, self.agent.representation)

    # Visual
    if visualize_steps:
        self.domain.show(a, self.agent.representation)
    self.logger.info("Total Experiment Duration %s" %
                     (hhmmss(deltaT(self.start_time))))