Esempio n. 1
0
    def policyEvaluation(self, policy):
        '''
        Evaluate a given policy: this is done by applying the Bellman backup
        over all states until the change is less than a given threshold.

        Each sweep visits every state ID of the representation, skips
        terminal states and states without possible actions, and backs up the
        action chosen by ``policy``. Sweeping stops on convergence, when the
        solver runs out of time, or after ``self.max_PE_iterations`` sweeps.

        :param policy: policy to evaluate; its ``pi`` method selects the
            action backed up in each state and the weight vector of its
            representation is the reference for the convergence check.

        Returns: convergence status as a boolean
        '''
        converged = False
        policy_evaluation_iteration = 0
        # Last state reached by a sweep; only used for plotting. It stays
        # None while no state has been reached (empty state space, or time
        # running out before the first state), so plotting the domain can be
        # skipped instead of failing on an unbound name.
        s = None
        while (not converged and self.hasTime()
               and policy_evaluation_iteration < self.max_PE_iterations):
            policy_evaluation_iteration += 1

            # Sweep The State Space
            for i in xrange(0, self.representation.agg_states_num):

                # Check for solver time
                if not self.hasTime():
                    break

                # Map a state ID to state
                s = self.representation.stateID2state(i)

                # Skip terminal states and states with no possible action
                possible_actions = self.domain.possibleActions(s=s)
                if (self.domain.isTerminal(s) or len(possible_actions) == 0):
                    continue

                # Apply Bellman Backup
                self.BellmanBackup(s, policy.pi(s, False, possible_actions),
                                   self.ns_samples, policy)

                # Update number of backups
                self.bellmanUpdates += 1

                # Check for the performance
                if self.bellmanUpdates % self.log_interval == 0:
                    performance_return = self.performanceRun()[0]
                    self.logger.info('[%s]: BellmanUpdates=%d, Return=%0.4f' %
                                     (hhmmss(deltaT(self.start_time)),
                                      self.bellmanUpdates, performance_return))

            # Check for convergence: L_infinity norm of the difference
            # between the weight vector of the policy's representation and
            # the weight vector of the solver's representation
            weight_vec_change = l_norm(
                policy.representation.weight_vec -
                self.representation.weight_vec, np.inf)
            converged = weight_vec_change < self.convergence_threshold

            # Log Status
            self.logger.info(
                'PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f' %
                (policy_evaluation_iteration, hhmmss(deltaT(
                    self.start_time)), self.bellmanUpdates, weight_vec_change))

            # Show Plots
            if self.show:
                # The domain plot needs a state; skip it if none was reached
                if s is not None:
                    self.domain.showDomain(s=s, filename="policy")
                self.domain.showLearning(self.representation,
                                         filename="policy")

        return converged
Esempio n. 2
0
    def evaluate(self, total_steps, episode_number, visualize=0):
        """
        Measure the performance of the current agent and record it in
        ``self.result``.

        The agent is run ``self.checks_per_policy`` times; the averaged
        return, step count, termination value and discounted return are
        appended to ``self.result`` together with the learning progress.
        NumPy's global random state is saved on entry and restored on exit,
        and ``self.start_time`` is shifted so that the performance runs do
        not count towards the elapsed time.

        If the agent's class name is ``'PolicyEvaluation'``, ``self.result``
        is replaced by ``self.agent.STATS`` and nothing else is done.

        :param total_steps: (int)
                     number of steps used in learning so far
        :param episode_number: (int)
                        number of episodes used in learning so far
        :param visualize: (int)
                     performance run ``j`` is started with
                     ``visualize=True`` exactly when ``visualize > j``
        """
        # TODO resolve this hack
        if className(self.agent) == 'PolicyEvaluation':
            # Policy Evaluation Case
            self.result = self.agent.STATS
            return

        rng_snapshot = np.random.get_state()
        time_so_far = deltaT(self.start_time)
        n_checks = self.checks_per_policy

        # Accumulate the outcome of every performance run
        sum_return = 0.
        sum_steps = 0.
        sum_term = 0.
        sum_discounted = 0.
        for check_idx in xrange(n_checks):
            show_this_run = visualize > check_idx
            run_return, run_steps, run_term, run_discounted = \
                self.performanceRun(total_steps, visualize=show_this_run)
            sum_return += run_return
            sum_steps += run_steps
            sum_term += run_term
            sum_discounted += run_discounted

        # Average over the performance runs
        mean_return = sum_return / n_checks
        mean_steps = sum_steps / n_checks
        mean_term = sum_term / n_checks
        mean_discounted = sum_discounted / n_checks

        feature_count = self.agent.representation.features_num
        records = (
            ("learning_steps", total_steps),
            ("return", mean_return),
            ("learning_time", self.elapsed_time),
            ("num_features", feature_count),
            ("steps", mean_steps),
            ("terminated", mean_term),
            ("learning_episode", episode_number),
            ("discounted_return", mean_discounted),
        )
        for key, value in records:
            self.result[key].append(value)

        # reset start time such that performanceRuns don't count
        self.start_time = clock() - time_so_far

        # Linear extrapolation of the time needed for the remaining steps
        remaining = "?"
        if total_steps > 0:
            remaining = hhmmss(
                time_so_far * (self.max_steps - total_steps) / total_steps)

        message = self.performance_log_template.format(
            total_steps=total_steps,
            elapsed=hhmmss(time_so_far),
            remaining=remaining,
            totreturn=mean_return,
            steps=mean_steps,
            num_feat=feature_count)
        self.logger.info(message)

        np.random.set_state(rng_snapshot)
Esempio n. 3
0
    def evaluate(self, total_steps, episode_number, visualize=0):
        """
        Run performance checks for the current agent and store the averages.

        ``self.performanceRun`` is called ``self.checks_per_policy`` times and
        the mean of each of its four outputs is appended to ``self.result``
        next to the learning progress counters. The global NumPy random state
        is captured first and put back at the end; ``self.start_time`` is
        moved forward so the time spent here is excluded from the elapsed
        time.

        When the agent's class name is ``'PolicyEvaluation'`` the method only
        sets ``self.result`` to ``self.agent.STATS`` and returns.

        :param total_steps: (int)
                     number of steps used in learning so far
        :param episode_number: (int)
                        number of episodes used in learning so far
        :param visualize: (int)
                     run number ``j`` receives ``visualize=(visualize > j)``
        """
        # TODO resolve this hack
        if className(self.agent) == 'PolicyEvaluation':
            # Policy Evaluation Case
            self.result = self.agent.STATS
            return

        saved_state = np.random.get_state()
        elapsed_before = deltaT(self.start_time)
        runs = self.checks_per_policy

        # totals holds, in order: return, steps, terminated, discounted return
        totals = [0., 0., 0., 0.]
        for run_no in xrange(runs):
            p_ret, p_step, p_term, p_dret = self.performanceRun(
                total_steps, visualize=visualize > run_no)
            for slot, outcome in enumerate((p_ret, p_step, p_term, p_dret)):
                totals[slot] += outcome
        avg_return, avg_steps, avg_term, avg_discounted = [
            total / runs for total in totals]

        num_features = self.agent.representation.features_num
        log = self.result
        log["learning_steps"].append(total_steps)
        log["return"].append(avg_return)
        log["learning_time"].append(self.elapsed_time)
        log["num_features"].append(num_features)
        log["steps"].append(avg_steps)
        log["terminated"].append(avg_term)
        log["learning_episode"].append(episode_number)
        log["discounted_return"].append(avg_discounted)

        # reset start time such that performanceRuns don't count
        self.start_time = clock() - elapsed_before

        if total_steps > 0:
            steps_left = self.max_steps - total_steps
            remaining = hhmmss(elapsed_before * steps_left / total_steps)
        else:
            # No step taken yet, so no rate to extrapolate from
            remaining = "?"

        self.logger.info(
            self.performance_log_template.format(
                total_steps=total_steps,
                elapsed=hhmmss(elapsed_before),
                remaining=remaining,
                totreturn=avg_return,
                steps=avg_steps,
                num_feat=num_features))

        np.random.set_state(saved_state)
    def solve(self):
        """Solve the domain MDP.

        Alternates trajectory-based policy evaluation with policy improvement
        while the solver has time and the L2 norm of the weight-vector change
        between two consecutive iterations is not below
        ``self.convergence_threshold``. After every iteration a performance
        run is executed and its statistics are appended to ``self.result``.
        """

        self.start_time = clock()  # Used to track the total time for solving
        self.bellmanUpdates = 0
        converged = False
        PI_iteration = 0

        # The policy is maintained as separate copy of the representation.
        # This way as the representation is updated the policy remains intact
        policy = eGreedy(deepcopy(self.representation),
                         epsilon=0,
                         forcedDeterministicAmongBestActions=True)

        while self.hasTime() and not converged:

            self.trajectoryBasedPolicyEvaluation(policy)

            # Policy Improvement (Updating the representation of the value
            # function will automatically improve the policy
            PI_iteration += 1

            # Theta can increase in size if the representation is expanded
            # hence padding the weight vector with zeros
            paddedTheta = padZeros(policy.representation.weight_vec,
                                   len(self.representation.weight_vec))

            # Calculate the change in the weight_vec as L2-norm
            delta_weight_vec = np.linalg.norm(paddedTheta -
                                              self.representation.weight_vec)
            converged = delta_weight_vec < self.convergence_threshold

            # Update the underlying value function of the policy
            policy.representation = deepcopy(self.representation)

            (performance_return, performance_steps, performance_term,
             performance_discounted_return) = self.performanceRun()
            self.logger.info(
                'PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, Return=%0.3f, steps=%d, features=%d'
                % (PI_iteration, hhmmss(deltaT(self.start_time)),
                   self.bellmanUpdates, delta_weight_vec, performance_return,
                   performance_steps, self.representation.features_num))
            if self.show:
                # No state or action is available in this method (the
                # trajectories are simulated inside the policy evaluation),
                # so only the learning status of the representation is
                # plotted. The previous call referenced undefined names.
                self.domain.showLearning(self.representation)

            # store stats
            self.result["bellman_updates"].append(self.bellmanUpdates)
            self.result["return"].append(performance_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(
                self.representation.features_num)
            self.result["steps"].append(performance_steps)
            self.result["terminated"].append(performance_term)
            self.result["discounted_return"].append(
                performance_discounted_return)
            # NOTE(review): the key below is misspelled ("improvemnt"); it is
            # kept unchanged because result consumers may rely on it.
            self.result["policy_improvemnt_iteration"].append(PI_iteration)

        if converged:
            self.logger.info('Converged!')
        super(TrajectoryBasedPolicyIteration, self).solve()
    def trajectoryBasedPolicyEvaluation(self, policy):
        ''' Evaluate the current policy by simulating trajectories and update
        the value function along the visited states.

        For every visited state-action pair the weight vector of
        ``self.representation`` is moved by ``self.alpha`` times the Bellman
        error along the state-action features. Evaluation stops once
        ``self.MIN_CONVERGED_TRAJECTORIES`` consecutive trajectories had a
        maximum absolute Bellman error below ``self.convergence_threshold``,
        when the solver runs out of time, or after
        ``self.max_PE_iterations`` trajectories.

        :param policy: policy being evaluated; it is handed to
            ``self.sample_ns_na`` and to the one-step look-ahead.
        '''
        PE_iteration = 0
        evaluation_is_accurate = False
        converged_trajectories = 0

        # Feature discovery hook of the representation; None is the default
        # value if ``discover`` is not an attribute
        discover_func = getattr(self.representation, 'discover', None)
        can_discover = discover_func and callable(discover_func)

        while (not evaluation_is_accurate and self.hasTime()
               and PE_iteration < self.max_PE_iterations):

            # Generate a new episode e-greedy with the current values
            max_Bellman_Error = 0
            step = 0

            s, a, terminal = self.sample_ns_na(policy, start_trajectory=True)

            while (not terminal and step < self.domain.episodeCap
                   and self.hasTime()):
                new_Q = self.representation.Q_oneStepLookAhead(
                    s, a, self.ns_samples, policy)
                phi_s = self.representation.phi(s, terminal)
                phi_s_a = self.representation.phi_sa(s,
                                                     terminal,
                                                     a,
                                                     phi_s=phi_s)
                old_Q = np.dot(phi_s_a, self.representation.weight_vec)
                bellman_error = new_Q - old_Q

                # Update the value function using approximate bellman backup
                self.representation.weight_vec += (self.alpha * bellman_error *
                                                   phi_s_a)
                self.bellmanUpdates += 1
                step += 1
                max_Bellman_Error = max(max_Bellman_Error, abs(bellman_error))

                # Discover features if the representation has the discover
                # method. The hook that was looked up is the one invoked, so
                # the guard and the call cannot disagree.
                if can_discover:
                    discover_func(phi_s, bellman_error)

                s, a, terminal = self.sample_ns_na(policy, a)

            # check for convergence of policy evaluation
            PE_iteration += 1
            if max_Bellman_Error < self.convergence_threshold:
                converged_trajectories += 1
            else:
                # A single inaccurate trajectory resets the streak
                converged_trajectories = 0
            evaluation_is_accurate = (
                converged_trajectories >= self.MIN_CONVERGED_TRAJECTORIES)
            self.logger.info(
                'PE #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Features=%d'
                % (PE_iteration, hhmmss(deltaT(
                    self.start_time)), self.bellmanUpdates, max_Bellman_Error,
                   self.representation.features_num))
Esempio n. 6
0
    def policyImprovement(self, policy):
        ''' Given a policy improve it by taking the greedy action in each state based on the value function.

            Every possible action of every non-terminal state is backed up
            (while the solver has time); afterwards the weight vector of the
            solver's representation is copied into the policy's
            representation, a performance run is executed and its statistics
            are appended to ``self.result``.

            Returns a tuple ``(policy, policyChanges)``: the updated policy
            and the number of states in which the action chosen by the policy
            differed from the best action of the updated representation.
        '''
        policyChanges = 0
        i = 0
        while i < self.representation.agg_states_num and self.hasTime():
            s = self.representation.stateID2state(i)
            if not self.domain.isTerminal(s):
                # Query the action set once per state instead of once per use
                possible_actions = self.domain.possibleActions(s)
                if len(possible_actions):
                    for a in possible_actions:
                        if not self.hasTime():
                            break
                        self.BellmanBackup(s, a, self.ns_samples, policy)
                    old_action = policy.pi(s, False, possible_actions)
                    new_action = self.representation.bestAction(
                        s, False, possible_actions)
                    if old_action != new_action:
                        policyChanges += 1
            i += 1
        # This will cause the policy to be copied over
        policy.representation.weight_vec = self.representation.weight_vec.copy(
        )
        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            'PI #%d [%s]: BellmanUpdates=%d, Policy Change=%d, Return=%0.4f, Steps=%d'
            % (self.policy_improvement_iteration,
               hhmmss(deltaT(self.start_time)), self.bellmanUpdates,
               policyChanges, performance_return, performance_steps))

        # store stats
        self.result["bellman_updates"].append(self.bellmanUpdates)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(performance_discounted_return)
        self.result["policy_improvement_iteration"].append(
            self.policy_improvement_iteration)
        return policy, policyChanges
Esempio n. 7
0
    def policyImprovement(self, policy):
        ''' Given a policy improve it by taking the greedy action in each state based on the value function.

            While the solver has time, each possible action of each
            non-terminal state is backed up. The weight vector of the
            solver's representation is then copied into the policy's
            representation, and the statistics of one performance run are
            appended to ``self.result``.

            Returns a tuple ``(policy, policyChanges)``, where
            ``policyChanges`` counts the states in which the policy's action
            differed from the best action of the updated representation.
        '''
        policyChanges = 0
        i = 0
        while i < self.representation.agg_states_num and self.hasTime():
            s = self.representation.stateID2state(i)
            if not self.domain.isTerminal(s):
                # One lookup of the action set per state is enough; it is
                # reused for the backups and for the policy comparison
                possible_actions = self.domain.possibleActions(s)
                if len(possible_actions):
                    for a in possible_actions:
                        if not self.hasTime():
                            break
                        self.BellmanBackup(s, a, self.ns_samples, policy)
                    policy_action = policy.pi(s, False, possible_actions)
                    greedy_action = self.representation.bestAction(
                        s, False, possible_actions)
                    if policy_action != greedy_action:
                        policyChanges += 1
            i += 1
        # This will cause the policy to be copied over
        policy.representation.weight_vec = self.representation.weight_vec.copy()
        (performance_return, performance_steps, performance_term,
         performance_discounted_return) = self.performanceRun()
        self.logger.info(
            'PI #%d [%s]: BellmanUpdates=%d, Policy Change=%d, Return=%0.4f, Steps=%d' % (
                self.policy_improvement_iteration,
                hhmmss(deltaT(self.start_time)),
                self.bellmanUpdates,
                policyChanges,
                performance_return,
                performance_steps))

        # store stats
        self.result["bellman_updates"].append(self.bellmanUpdates)
        self.result["return"].append(performance_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(performance_steps)
        self.result["terminated"].append(performance_term)
        self.result["discounted_return"].append(performance_discounted_return)
        self.result["policy_improvement_iteration"].append(self.policy_improvement_iteration)
        return policy, policyChanges
    def solve(self):
        """Solve the domain MDP by backing up values along sampled trajectories.

        Each trajectory starts at ``self.domain.s0()``. Actions are chosen
        with ``bestAction`` when ``np.random.rand() > self.epsilon`` and with
        ``randSet`` otherwise. After every trajectory a performance run is
        executed and its statistics are appended to ``self.result``. The loop
        ends when ``self.MIN_CONVERGED_TRAJECTORIES`` consecutive trajectories
        had a maximum absolute Bellman error below
        ``self.convergence_threshold`` or when the solver runs out of time.
        """

        def choose_action(state, is_terminal, actions):
            # One random draw decides between the greedy and the random action
            if np.random.rand() > self.epsilon:
                return self.representation.bestAction(
                    state, is_terminal, actions)
            return randSet(actions)

        # Used to show the total time took the process
        self.start_time = clock()
        n_backups = 0
        n_trajectories = 0
        # Number of consecutive trajectories with very small observed
        # Bellman error
        small_error_streak = 0
        converged = False

        while self.hasTime() and not converged:

            # Generate a new episode e-greedy with the current values
            worst_error = 0
            step = 0
            s, terminal, p_actions = self.domain.s0()
            a = choose_action(s, terminal, p_actions)

            while (not terminal and step < self.domain.episodeCap
                   and self.hasTime()):
                target_Q = self.representation.Q_oneStepLookAhead(
                    s, a, self.ns_samples)
                phi_s = self.representation.phi(s, terminal)
                phi_s_a = self.representation.phi_sa(s, terminal, a, phi_s)
                current_Q = np.dot(phi_s_a, self.representation.weight_vec)
                bellman_error = target_Q - current_Q

                # Move the weights along the features of the visited pair
                self.representation.weight_vec += self.alpha * bellman_error * phi_s_a
                n_backups += 1
                step += 1

                # Discover features if the representation has the discover
                # method (None is the default if it is not an attribute)
                hook = getattr(self.representation, 'discover', None)
                if hook and callable(hook):
                    self.representation.discover(phi_s, bellman_error)

                worst_error = max(worst_error, abs(bellman_error))

                # Simulate new state and action on trajectory
                _, s, terminal, p_actions = self.domain.step(a)
                a = choose_action(s, terminal, p_actions)

            # check for convergence
            n_trajectories += 1
            if worst_error < self.convergence_threshold:
                small_error_streak += 1
            else:
                small_error_streak = 0
            (perf_return, perf_steps, perf_term,
             perf_discounted) = self.performanceRun()
            converged = small_error_streak >= self.MIN_CONVERGED_TRAJECTORIES

            log_values = (n_trajectories, hhmmss(deltaT(self.start_time)),
                          n_backups, worst_error, perf_return, perf_steps,
                          self.representation.features_num)
            self.logger.info(
                'PI #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Return=%0.4f, Steps=%d, Features=%d'
                % log_values)
            if self.show:
                self.domain.show(a, representation=self.representation, s=s)

            # store stats
            stats = self.result
            stats["bellman_updates"].append(n_backups)
            stats["return"].append(perf_return)
            stats["planning_time"].append(deltaT(self.start_time))
            stats["num_features"].append(self.representation.features_num)
            stats["steps"].append(perf_steps)
            stats["terminated"].append(perf_term)
            stats["discounted_return"].append(perf_discounted)
            stats["iteration"].append(n_trajectories)

        if converged:
            self.logger.info('Converged!')
        super(TrajectoryBasedValueIteration, self).solve()
Esempio n. 9
0
    def solve(self):
        """Solve the domain MDP by sweeping Bellman backups over all states.

        Requires a tabular representation: otherwise an error is logged and
        0 is returned. Each iteration backs up every possible action of every
        state ID (while the solver has time), then compares the weight vector
        with its copy from before the sweep using the L-infinity norm. The
        loop ends when that change is below ``self.convergence_threshold`` or
        when the solver runs out of time. Statistics of a performance run are
        appended to ``self.result`` after each iteration.
        """

        self.start_time = clock()  # Used to show the total time took the process

        # Check for Tabular Representation
        if not self.IsTabularRepresentation():
            self.logger.error("Value Iteration works only with a tabular representation.")
            return 0

        n_states = self.representation.agg_states_num
        n_backups = 0  # used to track the performance improvement.
        sweep_no = 0
        converged = False

        while self.hasTime() and not converged:
            sweep_no += 1

            # Keep the weights from before the sweep for the comparison
            weights_before = self.representation.weight_vec.copy()

            # Sweep The State Space; ``s`` and ``a`` are also read after
            # the sweep when plotting
            for state_id in xrange(n_states):
                s = self.representation.stateID2state(state_id)

                # Sweep through possible actions
                for a in self.domain.possibleActions(s):

                    # Check for available planning time
                    if not self.hasTime():
                        break

                    self.BellmanBackup(s, a, ns_samples=self.ns_samples)
                    n_backups += 1

                    # Create Log only every ``log_interval`` backups
                    if n_backups % self.log_interval != 0:
                        continue
                    interim_return, _, _, _ = self.performanceRun()
                    self.logger.info(
                        '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                        (hhmmss(deltaT(self.start_time)), n_backups,
                         interim_return))

            # check for convergence
            weight_vec_change = l_norm(
                weights_before - self.representation.weight_vec, np.inf)
            converged = weight_vec_change < self.convergence_threshold

            # log the stats
            (perf_return, perf_steps, perf_term,
             perf_discounted) = self.performanceRun()
            log_values = (sweep_no, hhmmss(deltaT(self.start_time)),
                          n_backups, weight_vec_change, perf_return,
                          perf_steps)
            self.logger.info(
                'PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, Return=%0.4f, Steps=%d'
                % log_values)

            # Show the domain and value function
            if self.show:
                self.domain.show(a, s=s, representation=self.representation)

            # store stats
            stats = self.result
            stats["bellman_updates"].append(n_backups)
            stats["return"].append(perf_return)
            stats["planning_time"].append(deltaT(self.start_time))
            stats["num_features"].append(self.representation.features_num)
            stats["steps"].append(perf_steps)
            stats["terminated"].append(perf_term)
            stats["discounted_return"].append(perf_discounted)
            stats["iteration"].append(sweep_no)

        if converged:
            self.logger.info('Converged!')
        super(ValueIteration, self).solve()
Esempio n. 10
0
    def solve(self):
        """Solve the domain MDP with value backups along sampled trajectories.

        Trajectories begin at ``self.domain.s0()`` and pick the action from
        ``bestAction`` if ``np.random.rand() > self.epsilon``, else from
        ``randSet``. Every trajectory is followed by a performance run whose
        statistics are appended to ``self.result``. Solving stops after
        ``self.MIN_CONVERGED_TRAJECTORIES`` consecutive trajectories whose
        maximum absolute Bellman error stayed below
        ``self.convergence_threshold``, or when the solver has no time left.
        """

        def next_action(state, is_terminal, actions):
            # The random draw happens first, exactly once per selection
            explore = not (np.random.rand() > self.epsilon)
            if explore:
                return randSet(actions)
            return self.representation.bestAction(state, is_terminal, actions)

        # Used to show the total time took the process
        self.start_time = clock()
        backups_done = 0
        trajectories_done = 0
        # Length of the current run of trajectories with very small observed
        # Bellman error
        accurate_in_a_row = 0
        converged = False

        while self.hasTime() and not converged:

            # Generate a new episode e-greedy with the current values
            largest_error = 0
            step = 0
            s, terminal, p_actions = self.domain.s0()
            a = next_action(s, terminal, p_actions)

            while (not terminal and step < self.domain.episodeCap
                   and self.hasTime()):
                lookahead_Q = self.representation.Q_oneStepLookAhead(
                    s, a, self.ns_samples)
                phi_s = self.representation.phi(s, terminal)
                phi_s_a = self.representation.phi_sa(s, terminal, a, phi_s)
                estimate_Q = np.dot(phi_s_a, self.representation.weight_vec)
                bellman_error = lookahead_Q - estimate_Q

                # In-place update of the weights of the representation
                self.representation.weight_vec += self.alpha * bellman_error * phi_s_a
                backups_done += 1
                step += 1

                # Discover features if the representation has the discover
                # method (None is the default if it is not an attribute)
                discover_attr = getattr(self.representation, 'discover', None)
                if discover_attr and callable(discover_attr):
                    self.representation.discover(phi_s, bellman_error)

                largest_error = max(largest_error, abs(bellman_error))

                # Simulate new state and action on trajectory
                _, s, terminal, p_actions = self.domain.step(a)
                a = next_action(s, terminal, p_actions)

            # check for convergence
            trajectories_done += 1
            if largest_error < self.convergence_threshold:
                accurate_in_a_row += 1
            else:
                accurate_in_a_row = 0
            (run_return, run_steps, run_term,
             run_discounted) = self.performanceRun()
            converged = accurate_in_a_row >= self.MIN_CONVERGED_TRAJECTORIES

            self.logger.info(
                'PI #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Return=%0.4f, Steps=%d, Features=%d'
                % (trajectories_done,
                   hhmmss(deltaT(self.start_time)),
                   backups_done,
                   largest_error,
                   run_return,
                   run_steps,
                   self.representation.features_num))
            if self.show:
                self.domain.show(a, representation=self.representation, s=s)

            # store stats
            results = self.result
            results["bellman_updates"].append(backups_done)
            results["return"].append(run_return)
            results["planning_time"].append(deltaT(self.start_time))
            results["num_features"].append(self.representation.features_num)
            results["steps"].append(run_steps)
            results["terminated"].append(run_term)
            results["discounted_return"].append(run_discounted)
            results["iteration"].append(trajectories_done)

        if converged:
            self.logger.info('Converged!')
        super(TrajectoryBasedValueIteration, self).solve()
    def run(self,
            performance_domain=None,
            visualize_performance=0,
            visualize_learning=False,
            visualize_steps=False,
            debug_on_sigurg=False):
        """
        Run the experiment and collect statistics / generate the results

        :param visualize_performance: (int)
            determines whether a visualization of the steps taken in
            performance runs are shown. 0 means no visualization is shown.
            A value n > 0 means that only the first n performance runs for a
            specific policy are shown (i.e., for n < checks_per_policy, not all
            performance runs are shown)
        :param visualize_learning: (boolean)
            show some visualization of the learning status before each
            performance evaluation (e.g. Value function)
        :param visualize_steps: (boolean)
            visualize all steps taken during learning
        :param debug_on_sigurg: (boolean)
            if true, the ipdb debugger is opened when the python process
            receives a SIGURG signal. This allows to enter a debugger at any
            time, e.g. to view data interactively or actual debugging.
            The feature works only in Unix systems. The signal can be sent
            with the kill command:

                kill -URG pid

            where pid is the process id of the python interpreter running this
            function.

        """

        if debug_on_sigurg:
            rlpy.Tools.ipshell.ipdb_on_SIGURG()

        if performance_domain == None:
            self.performance_domain = deepcopy(self.domain)
        else:
            self.performance_domain = deepcopy(performance_domain)

        self.seed_components()

        self.result = defaultdict(list)
        self.result["seed"] = self.exp_id
        total_steps = 0
        eps_steps = 0
        eps_return = 0
        episode_number = 0

        # show policy or value function of initial policy
        if visualize_learning:
            self.domain.showLearning(self.agent.representation)

        # Used to bound the number of logs in the file
        start_log_time = clock()
        # Used to show the total time took the process
        self.start_time = clock()
        self.elapsed_time = 0
        # do a first evaluation to get the quality of the inital policy
        self.all_experiment_list = []
        self.evaluate(total_steps, episode_number, visualize_performance)
        self.total_eval_time = 0.
        terminal = True
        curr_experiment_list = []
        while total_steps < self.max_steps:
            if terminal or eps_steps >= self.domain.episodeCap:
                # if curr_experiment_list!=[]:
                #     self.all_experiment_list.append(curr_experiment_list)
                curr_experiment_list = []
                s, terminal, p_actions = self.domain.s0()
                a = self.agent.policy.pi(s, terminal, p_actions)
                # Visual
                if visualize_steps:
                    self.domain.show(a, self.agent.representation)

                # Output the current status if certain amount of time has been
                # passed
                eps_return = 0
                eps_steps = 0
                episode_number += 1
            # Act,Step
            curr_experiment_list.append((str(list(s)), str(a)))
            r, ns, terminal, np_actions = self.domain.step(a)

            self._gather_transition_statistics(s, a, ns, r, learning=True)
            na = self.agent.policy.pi(ns, terminal, np_actions)

            total_steps += 1
            eps_steps += 1
            eps_return += r

            # Print Current performance
            if (terminal or eps_steps == self.domain.episodeCap
                ) and deltaT(start_log_time) > self.log_interval:
                start_log_time = clock()
                elapsedTime = deltaT(self.start_time)
                self.logger.info(
                    self.log_template.format(
                        total_steps=total_steps,
                        elapsed=hhmmss(elapsedTime),
                        remaining=hhmmss(elapsedTime *
                                         (self.max_steps - total_steps) /
                                         total_steps),
                        totreturn=eps_return,
                        steps=eps_steps,
                        num_feat=self.agent.representation.features_num))

            # learning
            self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
            s, a, p_actions = ns, na, np_actions
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)

            # Check Performance
            if total_steps % (self.max_steps / self.num_policy_checks) == 0:
                self.elapsed_time = deltaT(
                    self.start_time) - self.total_eval_time

                # show policy or value function
                if visualize_learning:
                    self.domain.showLearning(self.agent.representation)

                self.evaluate(total_steps, episode_number,
                              visualize_performance)
                self.total_eval_time += deltaT(self.start_time) - \
                    self.elapsed_time - \
                    self.total_eval_time
                start_log_time = clock()

        # Visual
        if visualize_steps:
            self.domain.show(a, self.agent.representation)
        self.logger.info("Total Experiment Duration %s" %
                         (hhmmss(deltaT(self.start_time))))
Esempio n. 12
0
    def evaluate(self, total_steps, episode_number, visualize=0):
        """
        Evaluate the current agent within an experiment.

        Executes ``self.checks_per_policy`` performance runs, averages their
        statistics, appends the averages to ``self.result`` and logs a
        summary line. The global NumPy random state is captured on entry and
        restored on exit (also when a performance run raises), and
        ``self.start_time`` is shifted so that the time spent in the
        performance runs is not counted as learning time.

        :param total_steps: (int)
                     number of steps used in learning so far
        :param episode_number: (int)
                        number of episodes used in learning so far
        :param visualize: (int)
                        performance run ``j`` (counted from 0) is started
                        with ``visualize=True`` iff ``visualize > j``
        """
        # A single parenthesised argument prints the same text whether this
        # line is parsed as a Python 2 print statement or a Python 3 call.
        print("Stepsize: %f" % self.agent.learn_rate)
        np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

        random_state = np.random.get_state()
        try:
            elapsedTime = deltaT(self.start_time)
            num_checks = self.checks_per_policy
            performance_return = 0.
            performance_steps = 0.
            performance_term = 0.
            performance_discounted_return = 0.
            performance_return_squared = 0.
            for j in range(num_checks):
                p_ret, p_step, p_term, p_dret = self.performanceRun(
                    total_steps, visualize=visualize > j)
                print("%s %s" % (j, p_ret))
                performance_return += p_ret
                performance_steps += p_step
                performance_term += p_term
                performance_discounted_return += p_dret
                performance_return_squared += p_ret ** 2
            performance_return /= num_checks
            performance_return_squared /= num_checks
            performance_steps /= num_checks
            performance_term /= num_checks
            performance_discounted_return /= num_checks

            # E[x^2] - E[x]^2 can come out marginally negative through
            # floating-point rounding (e.g. when all returns are equal);
            # clamp it so the standard deviation is 0 instead of NaN.
            # np.maximum still propagates a genuine NaN in the returns.
            variance = performance_return_squared - performance_return ** 2
            std_return = np.sqrt(np.maximum(variance, 0.))

            self.result["learning_steps"].append(total_steps)
            self.result["return"].append(performance_return)
            self.result["return_std"].append(std_return)
            self.result["learning_time"].append(self.elapsed_time)
            self.result["num_features"].append(
                self.agent.representation.features_num)
            self.result["steps"].append(performance_steps)
            self.result["terminated"].append(performance_term)
            self.result["learning_episode"].append(episode_number)
            self.result["discounted_return"].append(
                performance_discounted_return)
            # reset start time such that performanceRuns don't count
            self.start_time = clock() - elapsedTime
            self.logger.info(
                self.performance_log_template.format(
                    episode=episode_number,
                    total_steps=total_steps,
                    totreturn=performance_return,
                    stdreturn=std_return,  #TODO
                    steps=performance_steps,
                    num_feat=self.agent.representation.features_num))
        finally:
            # Evaluation must not perturb the random stream used for learning,
            # even if one of the performance runs failed.
            np.random.set_state(random_state)
    def solveInMatrixFormat(self):
        """
        Solve by repeated batch policy evaluation in matrix form.

        Repeats until the weight change falls below
        ``self.convergence_threshold`` or ``self.hasTime()`` turns false:

        1. gather ``self.samples_num`` samples following an e-greedy policy,
        2. accumulate the ``A`` and ``b`` estimates from those samples,
        3. solve for the new weight vector and measure how far it moved.

        The new weights are written to ``self.representation.weight_vec`` and
        the statistics of each iteration are appended to ``self.result``.
        """
        self.policy = eGreedy(self.representation, epsilon=self.epsilon)

        # Sample budget of one policy-evaluation phase
        # (L1 in the Geramifard et. al. FTML 2012 paper).
        self.samples_num = 1000

        # Reference point for every elapsed-time figure reported below.
        self.start_time = clock()
        total_samples = 0
        converged = False
        iteration = 0
        as_column = (-1, 1)
        while self.hasTime() and not converged:

            # Step 1: draw a fresh batch with the e-greedy policy.
            S, Actions, NS, R, T = self.policy.collectSamples(self.samples_num)
            total_samples += self.samples_num

            # Step 2: accumulate the A and b estimates over the batch.
            a_num = self.domain.actions_num
            n = self.representation.features_num
            discount_factor = self.domain.discount_factor
            dim = n * a_num

            self.A = np.zeros((dim, dim))
            self.b = np.zeros((dim, 1))
            for k in xrange(self.samples_num):
                state = S[k]
                action = Actions[k, 0]
                phi_s_a = self.representation.phi_sa(
                    state, action).reshape(as_column)
                expected_next_phi = self.calculate_expected_phi_ns_na(
                    state, action, self.ns_samples).reshape(as_column)
                feature_diff = phi_s_a - discount_factor * expected_next_phi
                self.A += np.outer(phi_s_a, feature_diff.T)
                self.b += phi_s_a * R[k, 0]

            # Step 3: solve the regularized system and measure the change.
            new_weight_vec, solve_time = solveLinear(
                regularize(self.A), self.b)
            iteration += 1
            if solve_time > 1:
                slow_solve_msg = (
                    '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)'
                    % (iteration, solve_time))
                self.logger.info(slow_solve_msg)
            delta_weight_vec = l_norm(
                new_weight_vec - self.representation.weight_vec, np.inf)
            converged = delta_weight_vec < self.convergence_threshold
            self.representation.weight_vec = new_weight_vec

            (run_return, run_steps,
             run_term, run_discounted_return) = self.performanceRun()
            self.logger.info(
                '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f'
                % (iteration,
                   hhmmss(deltaT(self.start_time)),
                   total_samples,
                   delta_weight_vec,
                   run_return))
            if self.show:
                self.domain.show(S[-1], Actions[-1], self.representation)

            # Record this iteration's statistics, one series per key.
            iteration_stats = (
                ("samples", total_samples),
                ("return", run_return),
                ("planning_time", deltaT(self.start_time)),
                ("num_features", self.representation.features_num),
                ("steps", run_steps),
                ("terminated", run_term),
                ("discounted_return", run_discounted_return),
                ("iteration", iteration),
            )
            for key, value in iteration_stats:
                self.result[key].append(value)

        if converged:
            self.logger.info('Converged!')

        super(TrajectoryBasedPolicyIteration, self).solve()
Esempio n. 14
0
    def policyEvaluation(self, policy):

        '''
        Evaluate a given policy: this is done by applying the Bellman backup over all states until the change is less than
        a given threshold.

        Sweeps stop early when ``self.hasTime()`` turns false or after
        ``self.max_PE_iterations`` sweeps. Terminal states and states without
        any possible action are skipped.

        :param policy: policy to evaluate; ``policy.pi`` selects the action
            that is backed up in each state and ``policy.representation``
            holds the weights the current ones are compared against.

        Returns: convergence status as a boolean
        '''
        converged = False
        policy_evaluation_iteration = 0
        # Last state that actually received a Bellman backup, together with
        # its action set. The optional plot uses these instead of whatever
        # the sweep variables happen to hold: the sweep may stop before any
        # state is mapped (time-out), or end on a skipped state whose action
        # set is empty.
        shown_state = None
        shown_actions = None
        while (not converged and
                self.hasTime() and
                policy_evaluation_iteration < self.max_PE_iterations
                ):
            policy_evaluation_iteration += 1

            # Sweep The State Space
            for i in range(0, self.representation.agg_states_num):

                # Check for solver time
                if not self.hasTime():
                    break

                # Map an state ID to state
                s = self.representation.stateID2state(i)

                # Skip terminal states and states with no possible action
                possible_actions = self.domain.possibleActions(s=s)
                if (self.domain.isTerminal(s) or
                    len(possible_actions) == 0):
                    continue

                # Apply Bellman Backup
                self.BellmanBackup(
                    s,
                    policy.pi(s, False, possible_actions),
                    self.ns_samples,
                    policy)
                shown_state, shown_actions = s, possible_actions

                # Update number of backups
                self.bellmanUpdates += 1

                # Check for the performance
                if self.bellmanUpdates % self.log_interval == 0:
                    performance_return = self.performanceRun()[0]
                    self.logger.info(
                        '[%s]: BellmanUpdates=%d, Return=%0.4f' %
                        (hhmmss(deltaT(self.start_time)), self.bellmanUpdates, performance_return))

            # check for convergence: L_infinity norm of the difference between
            # the policy's weight vector and the representation's weight vector
            weight_vec_change = l_norm(policy.representation.weight_vec - self.representation.weight_vec, np.inf)
            converged = weight_vec_change < self.convergence_threshold

            # Log Status
            self.logger.info(
                'PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f' %
                (policy_evaluation_iteration, hhmmss(deltaT(self.start_time)), self.bellmanUpdates, weight_vec_change))

            # Show Plots (only once some state has been backed up)
            if self.show and shown_state is not None:
                self.domain.show(
                                 policy.pi(shown_state, False, shown_actions),
                                 self.representation,
                                 s=shown_state)
        return converged
    def trajectoryBasedPolicyEvaluation(self, policy):
        '''
        Evaluate ``policy`` by simulating trajectories and updating the value
        function along the visited states.

        Trajectories are generated until ``self.MIN_CONVERGED_TRAJECTORIES``
        consecutive ones keep their largest absolute Bellman error below
        ``self.convergence_threshold``, until ``self.hasTime()`` turns false,
        or until ``self.max_PE_iterations`` trajectories have been run.
        Nothing is returned; the weights of ``self.representation`` are
        updated in place.
        '''
        trajectories_run = 0
        consecutive_converged = 0
        accurate = False
        while not accurate and self.hasTime() and trajectories_run < self.max_PE_iterations:

            # Start a fresh trajectory under the current value estimates.
            worst_error = 0
            steps_taken = 0
            s, a, terminal = self.sample_ns_na(policy, start_trajectory=True)

            while not terminal and steps_taken < self.domain.episodeCap and self.hasTime():
                # Bellman error: one-step look-ahead target minus the
                # current linear estimate of Q(s, a).
                target_Q = self.representation.Q_oneStepLookAhead(
                    s, a, self.ns_samples, policy)
                phi_s = self.representation.phi(s, terminal)
                phi_s_a = self.representation.phi_sa(
                    s, terminal, a, phi_s=phi_s)
                current_Q = np.dot(phi_s_a, self.representation.weight_vec)
                bellman_error = target_Q - current_Q

                # Approximate Bellman backup on the weight vector.
                self.representation.weight_vec += (
                    self.alpha * bellman_error * phi_s_a)
                self.bellmanUpdates += 1
                steps_taken += 1
                error_size = abs(bellman_error)
                if error_size > worst_error:
                    worst_error = error_size

                # Feature discovery, only for representations exposing a
                # callable ``discover`` attribute.
                # NOTE(review): the check looks for ``discover`` but the call
                # goes to ``post_discover`` - confirm this is intended.
                discover_func = getattr(self.representation, 'discover', None)
                if discover_func and callable(discover_func):
                    self.representation.post_discover(phi_s, bellman_error)

                s, a, terminal = self.sample_ns_na(policy, a)

            # Convergence bookkeeping: count consecutive trajectories whose
            # worst error stayed under the threshold.
            trajectories_run += 1
            if worst_error < self.convergence_threshold:
                consecutive_converged = consecutive_converged + 1
            else:
                consecutive_converged = 0
            accurate = consecutive_converged >= self.MIN_CONVERGED_TRAJECTORIES

            status_fields = (
                trajectories_run,
                hhmmss(deltaT(self.start_time)),
                self.bellmanUpdates,
                worst_error,
                self.representation.features_num,
            )
            self.logger.info(
                'PE #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Features=%d' % status_fields)
    def solve(self):
        """
        Solve the domain MDP by trajectory-based policy iteration.

        Alternates ``self.trajectoryBasedPolicyEvaluation`` with a policy
        improvement step (refreshing the policy's copy of the representation)
        until the L2 change of the weight vector drops below
        ``self.convergence_threshold`` or ``self.hasTime()`` turns false.
        Statistics of every iteration are appended to ``self.result``.
        """

        self.start_time = clock()  # Used to track the total time for solving
        self.bellmanUpdates = 0
        converged = False
        PI_iteration = 0

        # The policy is maintained as separate copy of the representation.
        # This way as the representation is updated the policy remains intact
        policy = eGreedy(
            deepcopy(self.representation),
            epsilon=0,
            forcedDeterministicAmongBestActions=True)

        while self.hasTime() and not converged:

            self.trajectoryBasedPolicyEvaluation(policy)

            # Policy Improvement (Updating the representation of the value
            # function will automatically improve the policy
            PI_iteration += 1

            # Theta can increase in size if the representation is expanded
            # hence padding the weight vector with zeros
            paddedTheta = padZeros(
                policy.representation.weight_vec,
                len(self.representation.weight_vec))

            # Calculate the change in the weight_vec as L2-norm
            delta_weight_vec = np.linalg.norm(
                paddedTheta - self.representation.weight_vec)
            converged = delta_weight_vec < self.convergence_threshold

            # Update the underlying value function of the policy
            policy.representation = deepcopy(self.representation)

            (performance_return, performance_steps, performance_term,
             performance_discounted_return) = self.performanceRun()
            self.logger.info(
                'PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, Return=%0.3f, steps=%d, features=%d' %
                (PI_iteration,
                 hhmmss(deltaT(self.start_time)),
                 self.bellmanUpdates,
                 delta_weight_vec,
                 performance_return,
                 performance_steps,
                 self.representation.features_num))
            if self.show:
                # This method has no state/action of its own to display (the
                # previous code referenced undefined names here and raised
                # NameError). Show a trajectory start state and the action
                # the refreshed policy picks there, obtained the same way
                # trajectoryBasedPolicyEvaluation starts a trajectory.
                # NOTE(review): presumably this restarts the domain; the next
                # evaluation begins with the same call, so that should be
                # harmless - confirm.
                s, a, _ = self.sample_ns_na(policy, start_trajectory=True)
                self.domain.show(a, representation=self.representation, s=s)

            # store stats
            self.result["bellman_updates"].append(self.bellmanUpdates)
            self.result["return"].append(performance_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(
                self.representation.features_num)
            self.result["steps"].append(performance_steps)
            self.result["terminated"].append(performance_term)
            self.result["discounted_return"].append(
                performance_discounted_return)
            self.result["policy_improvemnt_iteration"].append(PI_iteration)

        if converged:
            self.logger.info('Converged!')
        super(TrajectoryBasedPolicyIteration, self).solve()
    def solveInMatrixFormat(self):
        """
        Batch (matrix-form) policy evaluation loop.

        While ``self.hasTime()`` holds and the weights have not converged:
        collect ``self.samples_num`` samples via ``self.collectSamples``,
        build the ``A`` / ``b`` estimates from them, solve for a new weight
        vector and compare it (L-infinity norm) with the current one. The
        returned policy is the one greedy w.r.t. the last weight vector held
        by ``self.representation``. Per-iteration statistics are appended to
        ``self.result``.
        """
        self.policy = eGreedy(self.representation, epsilon=self.epsilon)

        # Samples per policy-evaluation phase; L1 in the Geramifard et. al.
        # FTML 2012 paper.
        self.samples_num = 1000

        # Start of the solve; all reported durations are relative to it.
        self.start_time = clock()
        samples = iteration = 0
        converged = False
        while self.hasTime() and not converged:

            # -- 1. sample collection ------------------------------------
            S, Actions, NS, R, T = self.collectSamples(self.samples_num)
            samples += self.samples_num

            # -- 2. A and b estimates ------------------------------------
            a_num = self.domain.actions_num
            n = self.representation.features_num
            discount_factor = self.domain.discount_factor

            size = n * a_num
            self.A = np.zeros((size, size))
            self.b = np.zeros((size, 1))
            for i in xrange(self.samples_num):
                action = Actions[i, 0]
                # T[i] is handed to phi_sa as its second argument
                # (presumably the terminal flag of the sample - confirm).
                phi_s_a = self.representation.phi_sa(S[i], T[i], action)
                phi_s_a = phi_s_a.reshape((-1, 1))
                E_phi_ns_na = self.calculate_expected_phi_ns_na(
                    S[i], action, self.ns_samples)
                E_phi_ns_na = E_phi_ns_na.reshape((-1, 1))
                d = phi_s_a - discount_factor * E_phi_ns_na
                self.A += np.outer(phi_s_a, d.T)
                self.b += phi_s_a * R[i, 0]

            # -- 3. new weights and their change -------------------------
            regularized_A = regularize(self.A)
            new_weight_vec, solve_time = solveLinear(regularized_A, self.b)
            iteration += 1
            if solve_time > 1:
                self.logger.info(
                    '#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)'
                    % (iteration, solve_time))
            weight_shift = new_weight_vec - self.representation.weight_vec
            delta_weight_vec = l_norm(weight_shift, np.inf)
            converged = delta_weight_vec < self.convergence_threshold
            self.representation.weight_vec = new_weight_vec

            outcome = self.performanceRun()
            (performance_return, performance_steps,
             performance_term, performance_discounted_return) = outcome
            elapsed = hhmmss(deltaT(self.start_time))
            self.logger.info(
                '#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f'
                % (iteration, elapsed, samples,
                   delta_weight_vec, performance_return))
            if self.show:
                self.domain.show(S[-1], Actions[-1], self.representation)

            # -- statistics ----------------------------------------------
            result = self.result
            result["samples"].append(samples)
            result["return"].append(performance_return)
            result["planning_time"].append(deltaT(self.start_time))
            result["num_features"].append(self.representation.features_num)
            result["steps"].append(performance_steps)
            result["terminated"].append(performance_term)
            result["discounted_return"].append(performance_discounted_return)
            result["iteration"].append(iteration)

        if converged:
            self.logger.info('Converged!')

        super(TrajectoryBasedPolicyIteration, self).solve()
Esempio n. 18
0
    def run(self, visualize_performance=0, visualize_learning=False,
            visualize_steps=False, debug_on_sigurg=False):
        """
        Run the experiment and collect statistics / generate the results

        The agent learns for ``self.max_steps`` steps. Its performance is
        evaluated once before learning starts and then every
        ``self.max_steps // self.num_policy_checks`` learning steps (at least
        every step; never if ``self.num_policy_checks`` is not positive).
        Results are stored in ``self.result``.

        :param visualize_performance: (int)
            determines whether a visualization of the steps taken in
            performance runs are shown. 0 means no visualization is shown.
            A value n > 0 means that only the first n performance runs for a
            specific policy are shown (i.e., for n < checks_per_policy, not all
            performance runs are shown)
        :param visualize_learning: (boolean)
            show some visualization of the learning status before each
            performance evaluation (e.g. Value function)
        :param visualize_steps: (boolean)
            visualize all steps taken during learning
        :param debug_on_sigurg: (boolean)
            if true, the ipdb debugger is opened when the python process
            receives a SIGURG signal. This allows to enter a debugger at any
            time, e.g. to view data interactively or actual debugging.
            The feature works only in Unix systems. The signal can be sent
            with the kill command:

                kill -URG pid

            where pid is the process id of the python interpreter running this
            function.

        """

        if debug_on_sigurg:
            rlpy.Tools.ipshell.ipdb_on_SIGURG()
        self.performance_domain = deepcopy(self.domain)
        self.seed_components()

        self.result = defaultdict(list)
        self.result["seed"] = self.exp_id
        total_steps = 0
        eps_steps = 0
        eps_return = 0
        episode_number = 0

        # show policy or value function of initial policy
        if visualize_learning:
            self.domain.showLearning(self.agent.representation)

        # Used to bound the number of logs in the file
        start_log_time = clock()
        # Used to show the total time took the process
        self.start_time = clock()
        self.elapsed_time = 0
        # do a first evaluation to get the quality of the inital policy
        self.evaluate(total_steps, episode_number, visualize_performance)
        self.total_eval_time = 0.

        # Number of learning steps between two performance checks. Floor
        # division keeps the interval integral under both integer and true
        # division semantics, and the lower bound of 1 avoids a modulo by
        # zero when more checks than steps are requested.
        if self.num_policy_checks > 0:
            check_interval = max(
                1, int(self.max_steps // self.num_policy_checks))
        else:
            check_interval = 0  # no periodic checks requested

        terminal = True
        while total_steps < self.max_steps:
            if terminal or eps_steps >= self.domain.episodeCap:
                # Start a new episode
                s, terminal, p_actions = self.domain.s0()
                a = self.agent.policy.pi(s, terminal, p_actions)
                # Visual
                if visualize_steps:
                    self.domain.show(a, self.agent.representation)

                eps_return = 0
                eps_steps = 0
                episode_number += 1
            # Act,Step
            r, ns, terminal, np_actions = self.domain.step(a)

            self._gather_transition_statistics(s, a, ns, r, learning=True)
            na = self.agent.policy.pi(ns, terminal, np_actions)

            total_steps += 1
            eps_steps += 1
            eps_return += r

            # Print current performance at the end of an episode, but only
            # if more than log_interval has passed since the last log line
            episode_over = terminal or eps_steps == self.domain.episodeCap
            if episode_over and deltaT(start_log_time) > self.log_interval:
                start_log_time = clock()
                elapsedTime = deltaT(self.start_time)
                self.logger.info(
                    self.log_template.format(
                        total_steps=total_steps,
                        elapsed=hhmmss(elapsedTime),
                        remaining=hhmmss(
                            elapsedTime * (self.max_steps - total_steps) /
                            total_steps),
                        totreturn=eps_return,
                        steps=eps_steps,
                        num_feat=self.agent.representation.features_num))

            # learning
            self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
            s, a, p_actions = ns, na, np_actions
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)

            # Check Performance
            if check_interval and total_steps % check_interval == 0:
                self.elapsed_time = deltaT(
                    self.start_time) - self.total_eval_time

                # show policy or value function
                if visualize_learning:
                    self.domain.showLearning(self.agent.representation)

                self.evaluate(
                    total_steps,
                    episode_number,
                    visualize_performance)
                self.total_eval_time += deltaT(self.start_time) - \
                    self.elapsed_time - \
                    self.total_eval_time
                start_log_time = clock()

        # Visual: `a` only exists once at least one step has been taken
        if visualize_steps and total_steps > 0:
            self.domain.show(a, self.agent.representation)
        self.logger.info("Total Experiment Duration %s" % (hhmmss(deltaT(self.start_time))))
Esempio n. 19
0
    def run(self,
            visualize_performance=0,
            visualize_learning=False,
            visualize_steps=False,
            debug_on_sigurg=False):
        """
        Run the experiment and collect statistics / generate the results

        :param visualize_performance: (int)
            determines whether a visualization of the steps taken in
            performance runs are shown. 0 means no visualization is shown.
            A value n > 0 means that only the first n performance runs for a
            specific policy are shown (i.e., for n < checks_per_policy, not all
            performance runs are shown)
        :param visualize_learning: (boolean)
            show some visualization of the learning status before each
            performance evaluation (e.g. Value function)
        :param visualize_steps: (boolean)
            visualize all steps taken during learning
        :param debug_on_sigurg: (boolean)
            if true, the ipdb debugger is opened when the python process
            receives a SIGURG signal. This allows to enter a debugger at any
            time, e.g. to view data interactively or actual debugging.
            The feature works only in Unix systems. The signal can be sent
            with the kill command:

                kill -URG pid

            where pid is the process id of the python interpreter running this
            function.

        """

        if debug_on_sigurg:
            rlpy.Tools.ipshell.ipdb_on_SIGURG()
        self.performance_domain = deepcopy(self.domain)
        self.seed_components()

        self.result = defaultdict(list)
        self.result["seed"] = self.exp_id

        self.trials = defaultdict(list)
        self.trials["seed"] = self.exp_id

        total_steps = 0
        eps_steps = 0
        eps_return = 0
        episode_number = 0

        # show policy or value function of initial policy
        if visualize_learning:
            self.domain.showLearning(self.agent.representation)

        # Used to bound the number of logs in the file
        start_log_time = clock()
        # Used to show the total time took the process
        self.start_time = clock()
        self.elapsed_time = 0
        # do a first evaluation to get the quality of the inital policy
        self.evaluate(total_steps, episode_number, visualize_performance)
        self.total_eval_time = 0.
        terminal = True
        # while total_steps < self.max_steps:
        while episode_number < self.max_episode:
            if terminal or eps_steps >= self.domain.episodeCap:
                counter = defaultdict(int)
                # if len(self.agent.policy._trajectory['options']):
                #     sames = [max(Counter(x).values()) for x in self.agent.policy._trajectory['options']]
                #     print "Variance of unanimous choices: {}".format(np.mean(sames))
                self.agent.policy._trajectory = defaultdict(list)
                s, terminal, p_actions = self.domain.s0()
                # import ipdb; ipdb.set_trace()
                a = self.agent.policy.pi(s, terminal, p_actions)
                # Visual
                if visualize_steps:
                    self.domain.show(a, self.agent.representation)

                # Output the current status if certain amount of time has been
                # passed
                eps_return = 0
                eps_steps = 0
                episode_number += 1
            # Act,Step
            r, ns, terminal, np_actions = self.domain.step(a)
            counter[a] += 1
            # print "Next state: (%0.3f, %0.3f, %0.3f, %0.3f) : %0.5f," % (ns[0], ns[1], ns[2], ns[3], self.agent.representation.V(ns, terminal, np_actions))
            # if any(self.agent.representation.weight_vec > 0):
            #     wns = np.argmax(self.agent.representation.weight_vec)
            #     print self.agent.representation.weight_vec[wns]

            self.logger.debug(s, self.agent.representation.Qs(s, False))
            # print ns, self.agent.representation.Qs(ns, False)
            # print "*" * 10

            # self._gather_transition_statistics(s, a, ns, r, learning=True)
            na = self.agent.policy.pi(ns, terminal, np_actions)

            total_steps += 1
            eps_steps += 1
            eps_return += r

            # if total_steps == 60000:

            # Print Current performance
            if (terminal or eps_steps == self.domain.episodeCap):
                self.trials["learning_steps"].append(total_steps)
                self.trials["eps_steps"].append(eps_steps)
                self.trials["return"].append(eps_return)
                # print episode_number, eps_return
                self.trials["num_feat"].append(
                    self.agent.representation.features_num)
                self.trials["learning_episode"].append(episode_number)
                self.trials["Action_0"].append(float(counter[0]) / eps_steps)
                self.trials["Action_1"].append(float(counter[1]) / eps_steps)
                if self.best_return is not None:
                    self.trials["best_return"].append(self.best_return)

                self.update_best_representation(total_steps, episode_number,
                                                visualize_performance)
                # Check Performance
                if episode_number % (self.max_episode /
                                     self.num_policy_checks) == 0:
                    self.elapsed_time = deltaT(
                        self.start_time) - self.total_eval_time

                    # show policy or value function
                    if visualize_learning:
                        self.domain.showLearning(self.agent.representation)

                    # self.agent.policy.turn_on_printing()

                    self.evaluate(total_steps, episode_number,
                                  visualize_performance)

                    # self.agent.policy.turn_off_printing()
                    self.total_eval_time += deltaT(self.start_time) - \
                        self.elapsed_time - \
                        self.total_eval_time
                    start_log_time = clock()

            self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
            s, a, p_actions = ns, na, np_actions
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)

        # Visual
        if visualize_steps:
            self.domain.show(a, self.agent.representation)
        self.logger.info("Total Experiment Duration %s" %
                         (hhmmss(deltaT(self.start_time))))