def LSTD(self):
    """Run the LSTD algorithm on the collected data, and update the
    policy parameters.
    """
    start_time = tools.clock()

    if not self.fixed_rep:
        # build phi_s and phi_ns for all samples
        p = self.samples_count
        n = self.representation.features_num
        self.all_phi_s = np.empty(
            (p, n), dtype=self.representation.feature_type())
        self.all_phi_ns = np.empty(
            (p, n), dtype=self.representation.feature_type())
        for i in np.arange(self.samples_count):
            self.all_phi_s[i, :] = self.representation.phi(self.data_s[i])
            self.all_phi_ns[i, :] = self.representation.phi(self.data_ns[i])

        # build phi_s_a and phi_ns_na for all samples given phi_s and phi_ns
        self.all_phi_s_a = self.representation.batch_phi_sa(
            self.all_phi_s[:self.samples_count, :],
            self.data_a[:self.samples_count, :],
            use_sparse=self.use_sparse,
        )
        self.all_phi_ns_na = self.representation.batch_phi_sa(
            self.all_phi_ns[:self.samples_count, :],
            self.data_na[:self.samples_count, :],
            use_sparse=self.use_sparse,
        )

    # calculate A and b for LSTD
    F1 = self.all_phi_s_a[:self.samples_count, :]
    F2 = self.all_phi_ns_na[:self.samples_count, :]
    R = self.data_r[:self.samples_count, :]
    discount_factor = self.discount_factor

    if self.use_sparse:
        self.b = (F1.T * R).reshape(-1, 1)
        self.A = F1.T * (F1 - discount_factor * F2)
    else:
        self.b = np.dot(F1.T, R).reshape(-1, 1)
        self.A = np.dot(F1.T, F1 - discount_factor * F2)

    A = tools.regularize(self.A)

    # Calculate weight_vec
    self.representation.weight_vec, solve_time = tools.solveLinear(A, self.b)

    # log the solve time only if it takes more than 1 second
    if solve_time > 1:
        self.logger.info(
            "Total LSTD Time = %0.0f(s), Solve Time = %0.0f(s)"
            % (tools.deltaT(start_time), solve_time))
    else:
        self.logger.info(
            "Total LSTD Time = %0.0f(s)" % (tools.deltaT(start_time)))
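# --- Illustrative sketch (not part of the solver) ---------------------------
# The LSTD step above solves the linear system A w = b with
#   A = Phi_sa^T (Phi_sa - gamma * Phi_ns_na)   and   b = Phi_sa^T R.
# The standalone snippet below rebuilds that system on random synthetic
# feature matrices; the names, shapes, and the small ridge term (standing in
# for tools.regularize) are assumptions chosen for the example.
def _lstd_sketch():
    import numpy as np

    rng = np.random.default_rng(0)
    p, k = 200, 10                       # samples, state-action features
    gamma = 0.9
    phi_sa = rng.normal(size=(p, k))     # features of (s, a)
    phi_ns_na = rng.normal(size=(p, k))  # features of (s', a')
    rewards = rng.normal(size=(p, 1))

    A = phi_sa.T @ (phi_sa - gamma * phi_ns_na)
    b = phi_sa.T @ rewards
    # Small ridge term keeps the system solvable when A is ill-conditioned.
    w = np.linalg.solve(A + 1e-6 * np.eye(k), b)
    return w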
def policy_improvement(self, policy):
    """
    Given a policy, improve it by taking the greedy action in each state
    based on the value function.
    Returns the new policy.
    """
    policyChanges = 0
    i = 0
    while i < self.representation.num_states_total and self.has_time():
        s = self.representation.stateID2state(i)
        p_actions = self.domain.possible_actions(s)
        if not self.domain.is_terminal(s) and len(p_actions):
            for a in p_actions:
                self.bellman_backup(s, a, self.ns_samples, policy)
            p_actions = self.domain.possible_actions(s=s)
            best_action = self.representation.best_action(s, False, p_actions)
            if policy.pi(s, False, p_actions) != best_action:
                policyChanges += 1
        i += 1

    # This will cause the policy to be copied over
    policy.representation.weight = self.representation.weight.copy()
    perf_return, perf_steps, perf_term, perf_disc_return = self.performance_run()
    self.logger.info(
        "PI #%d [%s]: BellmanUpdates=%d, Policy Change=%d, Return=%0.4f, Steps=%d"
        % (
            self.policy_improvement_iteration,
            hhmmss(deltaT(self.start_time)),
            self.bellman_updates,
            policyChanges,
            perf_return,
            perf_steps,
        ))

    # store stats
    self.result["bellman_updates"].append(self.bellman_updates)
    self.result["return"].append(perf_return)
    self.result["planning_time"].append(deltaT(self.start_time))
    self.result["num_features"].append(self.representation.features_num)
    self.result["steps"].append(perf_steps)
    self.result["terminated"].append(perf_term)
    self.result["discounted_return"].append(perf_disc_return)
    self.result["policy_improvement_iteration"].append(
        self.policy_improvement_iteration)
    return policy, policyChanges
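# --- Illustrative sketch (not part of the solver) ---------------------------
# Policy improvement reduces to replacing the action in every state with the
# greedy one, argmax_a Q(s, a), and counting how many states changed. The toy
# version below does exactly that on a random tabular Q; all names are
# hypothetical and chosen for the example.
def _greedy_improvement_sketch():
    import numpy as np

    rng = np.random.default_rng(6)
    n_states, n_actions = 8, 4
    Q = rng.normal(size=(n_states, n_actions))
    old_policy = rng.integers(n_actions, size=n_states)

    new_policy = Q.argmax(axis=1)                  # greedy action per state
    policy_changes = int(np.sum(new_policy != old_policy))
    return new_policy, policy_changes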
def evaluate(self, total_steps, episode_number, visualize=0):
    """
    Evaluate the current agent within an experiment

    :param total_steps: (int) number of steps used in learning so far
    :param episode_number: (int) number of episodes used in learning so far
    """
    random_state = np.random.get_state()
    # random_state_domain = copy(self.domain.random_state)
    elapsedTime = deltaT(self.start_time)
    performance_return = 0.0
    performance_steps = 0.0
    performance_term = 0.0
    performance_discounted_return = 0.0
    for j in range(self.checks_per_policy):
        p_ret, p_step, p_term, p_dret = self.performance_run(
            total_steps, visualize=visualize > j)
        performance_return += p_ret
        performance_steps += p_step
        performance_term += p_term
        performance_discounted_return += p_dret
    performance_return /= self.checks_per_policy
    performance_steps /= self.checks_per_policy
    performance_term /= self.checks_per_policy
    performance_discounted_return /= self.checks_per_policy

    self.result["learning_steps"].append(total_steps)
    self.result["return"].append(performance_return)
    self.result["learning_time"].append(self.elapsed_time)
    self.result["num_features"].append(self.agent.representation.features_num)
    self.result["steps"].append(performance_steps)
    self.result["terminated"].append(performance_term)
    self.result["learning_episode"].append(episode_number)
    self.result["discounted_return"].append(performance_discounted_return)

    # reset start time such that performance runs don't count
    self.start_time = clock() - elapsedTime
    if total_steps > 0:
        remaining = hhmmss(
            elapsedTime * (self.max_steps - total_steps) / total_steps)
    else:
        remaining = "?"
    self.logger.info(
        self.performance_log_template.format(
            total_steps=total_steps,
            elapsed=hhmmss(elapsedTime),
            remaining=remaining,
            totreturn=performance_return,
            steps=performance_steps,
            num_feat=self.agent.representation.features_num,
        ))
    np.random.set_state(random_state)
def traj_based_policy_evaluation(self, policy):
    """
    Evaluate the current policy by simulating trajectories and update the
    value function along the visited states.
    """
    PE_iteration = 0
    evaluation_is_accurate = False
    converged_trajectories = 0
    while (not evaluation_is_accurate and self.has_time()
           and PE_iteration < self.max_pe_iterations):
        # Generate a new episode e-greedy with the current values
        max_bellman_error = 0
        step = 0
        s, a, terminal = self.sample_ns_na(policy, start_trajectory=True)
        while not terminal and step < self.domain.episode_cap and self.has_time():
            bellman_error, phi_s, phi_s_a = self._bellman_error(s, a, terminal)

            # Update the value function using an approximate Bellman backup
            self.representation.weight_vec += self.alpha * bellman_error * phi_s_a
            self.bellman_updates += 1
            step += 1
            max_bellman_error = max(max_bellman_error, abs(bellman_error))

            # Discover features if the representation supports it
            if hasattr(self.representation, "discover"):
                self.representation.post_discover(phi_s, bellman_error)
            s, a, terminal = self.sample_ns_na(policy, a)

        # check for convergence of policy evaluation
        PE_iteration += 1
        if max_bellman_error < self.convergence_threshold:
            converged_trajectories += 1
        else:
            converged_trajectories = 0
        evaluation_is_accurate = (
            converged_trajectories >= self.MIN_CONVERGED_TRAJECTORIES)
        self.logger.info(
            "PE #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Features=%d"
            % (
                PE_iteration,
                hhmmss(deltaT(self.start_time)),
                self.bellman_updates,
                max_bellman_error,
                self.representation.features_num,
            ))
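# --- Illustrative sketch (not part of the solver) ---------------------------
# Each inner step above performs an approximate Bellman backup along the
# sampled trajectory: w <- w + alpha * delta * phi(s, a), where
# delta = r + gamma * w^T phi(s', a') - w^T phi(s, a).
# The snippet below shows the same update on synthetic features; the names
# (alpha, gamma, phi_sa, phi_ns_na) are assumptions for the example only.
def _td_update_sketch():
    import numpy as np

    rng = np.random.default_rng(1)
    k, alpha, gamma = 8, 0.1, 0.95
    w = np.zeros(k)
    phi_sa = rng.normal(size=k)     # features of the visited (s, a)
    phi_ns_na = rng.normal(size=k)  # features of the next (s', a')
    r = 1.0

    delta = r + gamma * w @ phi_ns_na - w @ phi_sa  # Bellman (TD) error
    w = w + alpha * delta * phi_sa                  # gradient-style correction
    return w, delta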
def policyIteration(self):
    """Update the policy by recalculating A based on the new next actions (na).

    Returns the TD error for each sample based on the latest weights and
    next actions.
    """
    start_time = tools.clock()
    weight_diff = self.tol_epsilon + 1  # So that the loop starts
    lspi_iteration = 0
    self.best_performance = -np.inf
    self.logger.info("Running Policy Iteration:")

    # We save action_mask on the first iteration (used for batch_best_action)
    # so it can be reused to boost the speed.
    # action_mask is a matrix that shows which actions are available for each
    # state.
    action_mask = None
    discount_factor = self.discount_factor
    F1 = (sp.csr_matrix(self.all_phi_s_a[:self.samples_count, :])
          if self.use_sparse else self.all_phi_s_a[:self.samples_count, :])
    while lspi_iteration < self.lspi_iterations and weight_diff > self.tol_epsilon:
        # Find the best action for each state given the current value function.
        # Note that if several actions have the same value, the first one is
        # selected in batch mode.
        iteration_start_time = tools.clock()
        (
            best_action,
            self.all_phi_ns_new_na,
            action_mask,
        ) = self.representation.batch_best_action(
            self.data_ns[:self.samples_count, :],
            self.all_phi_ns,
            action_mask,
            self.use_sparse,
        )

        # Recalculate the A matrix (b remains the same) and solve for the new
        # weight_vec
        if self.use_sparse:
            F2 = sp.csr_matrix(self.all_phi_ns_new_na[:self.samples_count, :])
            A = F1.T * (F1 - discount_factor * F2)
        else:
            F2 = self.all_phi_ns_new_na[:self.samples_count, :]
            A = np.dot(F1.T, F1 - discount_factor * F2)

        A = tools.regularize(A)
        new_weight_vec, solve_time = tools.solveLinear(A, self.b)

        # Calculate TD errors
        td_errors = self.calculateTDErrors()

        # Calculate the weight difference. If it is big enough, update
        # weight_vec.
        weight_diff = np.linalg.norm(
            self.representation.weight_vec - new_weight_vec)
        if weight_diff > self.tol_epsilon:
            self.representation.weight_vec = new_weight_vec

        self.logger.info(
            "%d: %0.0f(s), ||w1-w2|| = %0.4f, Sparsity=%0.1f%%, %d Features"
            % (
                lspi_iteration + 1,
                tools.deltaT(iteration_start_time),
                weight_diff,
                tools.sparsity(A),
                self.representation.features_num,
            ))
        lspi_iteration += 1

    self.logger.info(
        "Total Policy Iteration Time = %0.0f(s)" % tools.deltaT(start_time))
    return td_errors
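# --- Illustrative sketch (not part of the solver) ---------------------------
# policyIteration alternates two steps until the weights stop moving:
#   (1) pick the greedy next action under the current weights, which changes
#       the next-feature matrix F2, and (2) re-solve A w = b with the new F2.
# The toy loop below mimics that structure with random per-action feature
# matrices; every name and shape in it is hypothetical, chosen for the example.
def _lspi_loop_sketch(max_iterations=20, tol=1e-4):
    import numpy as np

    rng = np.random.default_rng(2)
    p, k, n_actions, gamma = 300, 6, 3, 0.9
    F1 = rng.normal(size=(p, k))                    # phi(s, a) for the data
    R = rng.normal(size=(p, 1))
    # One candidate next-feature matrix per action: phi(s', a') for each a'.
    F2_per_action = rng.normal(size=(n_actions, p, k))
    b = F1.T @ R
    w = np.zeros((k, 1))

    for _ in range(max_iterations):
        # Greedy next action per sample under the current weights.
        q_next = np.stack([F2_per_action[a] @ w for a in range(n_actions)])
        best = q_next[:, :, 0].argmax(axis=0)
        F2 = F2_per_action[best, np.arange(p), :]
        A = F1.T @ (F1 - gamma * F2)
        w_new = np.linalg.solve(A + 1e-6 * np.eye(k), b)
        if np.linalg.norm(w_new - w) < tol:
            return w_new
        w = w_new
    return w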
def run(self, visualize_performance=0, visualize_learning=False,
        visualize_steps=False):
    """
    Run the experiment and collect statistics / generate the results

    :param visualize_performance: (int) determines whether a visualization
        of the steps taken in performance runs is shown. 0 means no
        visualization is shown. A value n > 0 means that only the first n
        performance runs for a specific policy are shown (i.e., for
        n < checks_per_policy, not all performance runs are shown)
    :param visualize_learning: (boolean) show some visualization of the
        learning status before each performance evaluation (e.g. value
        function)
    :param visualize_steps: (boolean) visualize all steps taken during
        learning
    """
    self.performance_domain = deepcopy(self.domain)
    self.performance_domain.performance = True
    self.seed_components()

    self.result = defaultdict(list)
    self.result["seed"] = self.exp_id
    total_steps = 0
    eps_steps = 0
    eps_return = 0
    episode_number = 0

    # show policy or value function of initial policy
    if visualize_learning:
        self.domain.show_learning(self.agent.representation)

    # Used to bound the number of logs in the file
    start_log_time = clock()
    # Used to track the total time taken by the experiment
    self.start_time = clock()
    self.elapsed_time = 0
    # do a first evaluation to get the quality of the initial policy
    self.evaluate(total_steps, episode_number, visualize_performance)
    self.total_eval_time = 0.0
    terminal = True
    while total_steps < self.max_steps:
        if terminal or eps_steps >= self.domain.episode_cap:
            s, terminal, p_actions = self.domain.s0()
            a = self.agent.policy.pi(s, terminal, p_actions)
            # Visual
            if visualize_steps:
                self.domain.show(a, self.agent.representation)
            # Reset episode statistics
            eps_return = 0
            eps_steps = 0
            episode_number += 1

        # Act, Step
        r, ns, terminal, np_actions = self.domain.step(a)
        self._gather_transition_statistics(s, a, ns, r, learning=True)
        na = self.agent.policy.pi(ns, terminal, np_actions)

        total_steps += 1
        eps_steps += 1
        eps_return += r

        # Print the current performance if enough time has passed since the
        # last log
        if ((terminal or eps_steps == self.domain.episode_cap)
                and deltaT(start_log_time) > self.log_interval):
            start_log_time = clock()
            elapsedTime = deltaT(self.start_time)
            self.logger.info(
                self.log_template.format(
                    total_steps=total_steps,
                    elapsed=hhmmss(elapsedTime),
                    remaining=hhmmss(
                        elapsedTime * (self.max_steps - total_steps) / total_steps),
                    totreturn=eps_return,
                    steps=eps_steps,
                    num_feat=self.agent.representation.features_num,
                ))

        # learning
        self.agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
        s, a, p_actions = ns, na, np_actions

        # Visual
        if visualize_steps:
            self.domain.show(a, self.agent.representation)

        # Check Performance
        if total_steps % (self.max_steps // self.num_policy_checks) == 0:
            self.elapsed_time = deltaT(self.start_time) - self.total_eval_time

            # show policy or value function
            if visualize_learning:
                self.domain.show_learning(self.agent.representation)

            self.evaluate(total_steps, episode_number, visualize_performance)
            self.total_eval_time += (deltaT(self.start_time)
                                     - self.elapsed_time
                                     - self.total_eval_time)
            start_log_time = clock()

    # Visual
    if visualize_steps:
        self.domain.show(a, self.agent.representation)
    self.logger.info(
        "Total Experiment Duration %s" % (hhmmss(deltaT(self.start_time))))
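# --- Illustrative sketch (not part of the experiment class) -----------------
# The learning loop above follows a SARSA-style handoff: act with (s, a),
# observe (r, ns), pick na for the next state, let the agent learn from the
# full transition, then shift (s, a) <- (ns, na). The minimal loop below shows
# that control flow only; `domain` and `agent` are hypothetical objects that
# expose the same s0/step/pi/learn interface assumed here, and episode caps,
# logging, and evaluation are intentionally left out.
def _interaction_loop_sketch(domain, agent, max_steps):
    total_steps = 0
    terminal = True
    while total_steps < max_steps:
        if terminal:
            s, terminal, p_actions = domain.s0()
            a = agent.policy.pi(s, terminal, p_actions)
        r, ns, terminal, np_actions = domain.step(a)
        na = agent.policy.pi(ns, terminal, np_actions)
        agent.learn(s, p_actions, a, r, ns, np_actions, na, terminal)
        s, a, p_actions = ns, na, np_actions
        total_steps += 1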
def has_time(self):
    """Return a boolean stating if there is time left for planning."""
    return deltaT(self.start_time) < self.planning_time
def solve_in_matrix_format(self):
    # while delta_weight_vec > threshold
    #   1. Gather data following an e-greedy policy
    #   2. Calculate A and b estimates
    #   3. Calculate new_weight_vec and delta_weight_vec
    # return policy greedy w.r.t. last weight_vec
    self.policy = eGreedy(self.representation, epsilon=self.epsilon)

    # Number of samples to be used for each policy evaluation phase. L1 in
    # the Geramifard et al. FTML 2012 paper
    self.samples_num = 1000

    self.start_time = clock()  # Used to track the total time for solving
    samples = 0
    converged = False
    iteration = 0
    while self.has_time() and not converged:
        # 1. Gather samples following an e-greedy policy
        S, Actions, NS, R, T = self.collect_samples(self.samples_num)
        samples += self.samples_num

        # 2. Calculate A and b estimates
        a_num = self.domain.num_actions
        n = self.representation.features_num
        discount_factor = self.domain.discount_factor

        self.A = np.zeros((n * a_num, n * a_num))
        self.b = np.zeros((n * a_num, 1))
        for i in range(self.samples_num):
            phi_s_a = self.representation.phi_sa(
                S[i], T[i], Actions[i, 0]).reshape((-1, 1))
            E_phi_ns_na = self.calculate_expected_phi_ns_na(
                S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
            d = phi_s_a - discount_factor * E_phi_ns_na
            self.A += np.outer(phi_s_a, d.T)
            self.b += phi_s_a * R[i, 0]

        # 3. Calculate new_weight_vec and delta_weight_vec
        new_weight_vec, solve_time = solveLinear(regularize(self.A), self.b)
        iteration += 1
        if solve_time > 1:
            self.logger.info(
                "#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)"
                % (iteration, solve_time))
        weight_diff = l_norm(new_weight_vec - self.representation.weight_vec)
        converged = weight_diff < self.convergence_threshold
        self.representation.weight_vec = new_weight_vec
        (
            perf_return,
            perf_steps,
            perf_term,
            perf_disc_return,
        ) = self.performance_run()
        self.logger.info(
            "#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f"
            % (
                iteration,
                hhmmss(deltaT(self.start_time)),
                samples,
                weight_diff,
                perf_return,
            ))
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["samples"].append(samples)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
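# --- Illustrative sketch (not part of the solver) ---------------------------
# Step 2 above accumulates the matrix-format estimates one sample at a time:
#   A += phi(s, a) (phi(s, a) - gamma * E[phi(s', a')])^T,   b += phi(s, a) r.
# The snippet below repeats that accumulation with random feature vectors; the
# shapes and the expectation stand-in are assumptions made for the example.
def _matrix_estimate_sketch():
    import numpy as np

    rng = np.random.default_rng(3)
    n_samples, k, gamma = 500, 12, 0.9
    A = np.zeros((k, k))
    b = np.zeros((k, 1))
    for _ in range(n_samples):
        phi_s_a = rng.normal(size=(k, 1))
        # Stand-in for the expected next feature vector E[phi(s', a')].
        e_phi_ns_na = rng.normal(size=(k, 1))
        r = rng.normal()
        d = phi_s_a - gamma * e_phi_ns_na
        A += phi_s_a @ d.T
        b += phi_s_a * r
    # Ridge term plays the role of the regularize() call above.
    w = np.linalg.solve(A + 1e-6 * np.eye(k), b)
    return w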
def _solve_impl(self):
    """Solve the domain MDP."""
    self.start_time = clock()  # Used to track the total time for solving
    self.bellman_updates = 0
    converged = False
    PI_iteration = 0

    # The policy is maintained as a separate copy of the representation.
    # This way, as the representation is updated, the policy remains intact.
    policy = eGreedy(deepcopy(self.representation), epsilon=0, deterministic=True)
    a_num = self.domain.num_actions

    while self.has_time() and not converged:
        # Policy evaluation: update the value-function representation for the
        # current policy
        self.traj_based_policy_evaluation(policy)
        PI_iteration += 1

        # The weight vector can grow if the representation is expanded, hence
        # pad the policy's weights with zeros before comparing
        additional_dim = (self.representation.features_num
                          - policy.representation.features_num)
        padded_theta = np.hstack(
            (policy.representation.weight, np.zeros((a_num, additional_dim))))

        # Calculate the change in the weight_vec as an L2-norm
        weight_diff = np.linalg.norm(padded_theta - self.representation.weight)
        converged = weight_diff < self.convergence_threshold

        # Update the underlying value function of the policy
        policy.representation = deepcopy(self.representation)

        (
            perf_return,
            perf_steps,
            perf_term,
            perf_disc_return,
        ) = self.performance_run()
        self.logger.info(
            "PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, "
            "Return=%0.3f, steps=%d, features=%d"
            % (
                PI_iteration,
                hhmmss(deltaT(self.start_time)),
                self.bellman_updates,
                weight_diff,
                perf_return,
                perf_steps,
                self.representation.features_num,
            ))
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["bellman_updates"].append(self.bellman_updates)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["policy_improvement_iteration"].append(PI_iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
def policy_evaluation(self, policy):
    """
    Evaluate a given policy: this is done by applying the Bellman backup over
    all states until the change is less than a given threshold.

    Returns: convergence status as a boolean
    """
    converged = False
    policy_evaluation_iteration = 0
    while (not converged and self.has_time()
           and policy_evaluation_iteration < self.max_pe_iterations):
        policy_evaluation_iteration += 1

        # Sweep The State Space
        for i in range(0, self.representation.num_states_total):
            # Check for solver time
            if not self.has_time():
                break

            # Map a state ID to a state
            s = self.representation.stateID2state(i)

            # Skip terminal states and states with no possible action
            possible_actions = self.domain.possible_actions(s=s)
            if self.domain.is_terminal(s) or len(possible_actions) == 0:
                continue

            # Apply Bellman Backup
            self.bellman_backup(
                s, policy.pi(s, False, possible_actions), self.ns_samples, policy)

            # Update number of backups
            self.bellman_updates += 1

            # Check for the performance
            if self.bellman_updates % self.log_interval == 0:
                performance_return = self.performance_run()[0]
                self.logger.info(
                    "[%s]: BellmanUpdates=%d, Return=%0.4f"
                    % (
                        hhmmss(deltaT(self.start_time)),
                        self.bellman_updates,
                        performance_return,
                    ))

        # check for convergence: norm of the difference between the policy's
        # weight vector and the representation's weight vector
        weight_diff = l_norm(
            policy.representation.weight - self.representation.weight)
        converged = weight_diff < self.convergence_threshold

        # Log Status
        self.logger.info(
            "PE #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f"
            % (
                policy_evaluation_iteration,
                hhmmss(deltaT(self.start_time)),
                self.bellman_updates,
                weight_diff,
            ))

        # Show Plots
        if self._visualize_mode:
            self.domain.show_learning(self.representation)
    return converged
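# --- Illustrative sketch (not part of the solver) ---------------------------
# The sweep above repeatedly applies the policy-evaluation Bellman backup
#   V(s) <- sum_{s'} P(s' | s, pi(s)) * (R(s, pi(s)) + gamma * V(s'))
# to every state. The tabular analogue below makes that backup explicit on a
# tiny synthetic MDP; P, R, and the sizes are made-up inputs for the example.
def _tabular_policy_evaluation_sketch(threshold=1e-6):
    import numpy as np

    rng = np.random.default_rng(4)
    n_states, gamma = 5, 0.9
    P = rng.random((n_states, n_states))   # P[s, s'] under the fixed policy
    P /= P.sum(axis=1, keepdims=True)
    R = rng.normal(size=n_states)           # expected reward in each state
    V = np.zeros(n_states)

    while True:
        V_new = R + gamma * P @ V            # one Bellman backup per state
        if np.max(np.abs(V_new - V)) < threshold:
            return V_new
        V = V_new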
def _solve_impl(self):
    """Solve the domain MDP."""
    self.start_time = clock()  # Used to track the total time for solving
    bellman_updates = 0  # number of Bellman backups performed so far
    converged = False
    iteration = 0
    num_states = self.representation.num_states_total

    while self.has_time() and not converged:
        iteration += 1

        # Store the weight vector for comparison
        prev_weight = self.representation.weight.copy()

        # Sweep The State Space
        for i in range(num_states):
            s = self.representation.stateID2state(i)
            # Sweep through possible actions
            if self.domain.is_terminal(s):
                continue
            for a in self.domain.possible_actions(s):
                self.bellman_backup(s, a, ns_samples=self.ns_samples)
                bellman_updates += 1

                # Create Log
                if bellman_updates % self.log_interval == 0:
                    performance_return, _, _, _ = self.performance_run()
                    self._log_updates(performance_return, bellman_updates)

        # check for convergence
        weight_diff = l_norm(prev_weight - self.representation.weight)
        converged = weight_diff < self.convergence_threshold

        # log the stats
        (
            perf_return,
            perf_steps,
            perf_term,
            perf_disc_return,
        ) = self.performance_run()
        self.logger.info(
            "PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, "
            "Return=%0.4f, Steps=%d"
            % (
                iteration,
                hhmmss(deltaT(self.start_time)),
                bellman_updates,
                weight_diff,
                perf_return,
                perf_steps,
            ))

        # Show the domain and value function
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["bellman_updates"].append(bellman_updates)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()
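# --- Illustrative sketch (not part of the solver) ---------------------------
# Each sweep above backs up every state-action pair and stops when the weight
# vector stops changing. The tabular analogue is the value-iteration update
#   V(s) <- max_a sum_{s'} P(s' | s, a) * (R(s, a) + gamma * V(s')),
# shown below on a random synthetic MDP; P, R, and the sizes are assumptions
# chosen for the example.
def _tabular_value_iteration_sketch(threshold=1e-6):
    import numpy as np

    rng = np.random.default_rng(5)
    n_states, n_actions, gamma = 5, 3, 0.9
    P = rng.random((n_actions, n_states, n_states))   # P[a, s, s']
    P /= P.sum(axis=2, keepdims=True)
    R = rng.normal(size=(n_actions, n_states))         # R[a, s]
    V = np.zeros(n_states)

    while True:
        Q = R + gamma * np.einsum("ast,t->as", P, V)    # Q[a, s]
        V_new = Q.max(axis=0)                           # greedy backup
        if np.max(np.abs(V_new - V)) < threshold:
            return V_new
        V = V_new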
def _log_updates(self, perf_return, bellman_updates):
    dt = hhmmss(deltaT(self.start_time))
    self.logger.info(
        "[%s]: BellmanUpdates=%d, Return=%0.4f"
        % (dt, bellman_updates, perf_return))
def _solve_impl(self):
    """Solve the domain MDP."""
    # Used to track the total time taken by the solver
    self.start_time = clock()
    bellman_updates = 0
    converged = False
    iteration = 0
    # Track the number of consecutive trajectories with a very small observed
    # Bellman error
    converged_trajectories = 0
    while self.has_time() and not converged:
        max_bellman_error = 0
        step = 0
        s, terminal, p_actions = self.domain.s0()

        # Generate a new episode e-greedy with the current values
        while not terminal and step < self.domain.episode_cap and self.has_time():
            a = self.eps_greedy(s, terminal, p_actions)
            bellman_error, phi_s, phi_s_a = self._bellman_error(s, a, terminal)

            # Update Parameters
            self.representation.weight_vec += self.alpha * bellman_error * phi_s_a
            bellman_updates += 1
            step += 1

            # Discover features if the representation supports it
            if hasattr(self.representation, "discover"):
                self.representation.post_discover(phi_s, bellman_error)

            max_bellman_error = max(max_bellman_error, abs(bellman_error))
            # Simulate the next state and action on the trajectory
            _, s, terminal, p_actions = self.domain.step(a)

        # check for convergence
        iteration += 1
        if max_bellman_error < self.convergence_threshold:
            converged_trajectories += 1
        else:
            converged_trajectories = 0
        (
            perf_return,
            perf_steps,
            perf_term,
            perf_disc_return,
        ) = self.performance_run()
        converged = converged_trajectories >= self.MIN_CONVERGED_TRAJECTORIES
        self.logger.info(
            "PI #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Return=%0.4f, "
            "Steps=%d, Features=%d"
            % (
                iteration,
                hhmmss(deltaT(self.start_time)),
                bellman_updates,
                max_bellman_error,
                perf_return,
                perf_steps,
                self.representation.features_num,
            ))
        if self._visualize_mode:
            self.domain.show_learning(self.representation)

        # store stats
        self.result["bellman_updates"].append(bellman_updates)
        self.result["return"].append(perf_return)
        self.result["planning_time"].append(deltaT(self.start_time))
        self.result["num_features"].append(self.representation.features_num)
        self.result["steps"].append(perf_steps)
        self.result["terminated"].append(perf_term)
        self.result["discounted_return"].append(perf_disc_return)
        self.result["iteration"].append(iteration)

    if converged:
        self.logger.info("Converged!")
    self.log_value()