Example #1
    def run_value_iteration(self, solver, epoch):
        run_start_time = time.time()

        reward = 0
        discounted_reward = 0
        discount = 1.0

        solver.value_iteration(self.model.get_transition_matrix(),
                               self.model.get_observation_matrix(),
                               self.model.get_reward_matrix(),
                               self.model.planning_horizon)

        b = self.model.get_initial_belief_state()

        for i in range(self.model.max_steps):

            # TODO: record average V(b) per epoch
            action, v_b = solver.select_action(b, solver.gamma)

            step_result = self.model.generate_step(action)

            if not step_result.is_terminal:
                b = self.model.belief_update(b, action,
                                             step_result.observation)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward
            discount *= self.model.discount

            # show the step result
            self.display_step_result(i, step_result)

            if step_result.is_terminal:
                console(3, module,
                        'Terminated after episode step ' + str(i + 1))
                break

            # TODO: add belief state History sequence

        self.results.time.add(time.time() - run_start_time)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        self.results.show(epoch)
        console(
            3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (
            self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(
            self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (
            self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(
            self.results.discounted_return.running_total)
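
The loop above alternates greedy action selection on the current belief `b` with `self.model.belief_update(b, action, observation)`. Purely as a hedged illustration of what such a discrete Bayes-filter update typically computes (the array shapes `T[a, s, s']` and `O[a, s', o]` are assumptions, not this model's actual interface):

import numpy as np

def bayes_belief_update(b, a, o, T, O):
    """Sketch only: b'(s') is proportional to O[a, s', o] * sum_s T[a, s, s'] * b(s)."""
    predicted = T[a].T.dot(b)            # sum_s T(s' | s, a) * b(s)
    updated = O[a, :, o] * predicted     # weight by the observation likelihood
    norm = updated.sum()
    # if the observation has zero likelihood under the belief, fall back to the prediction
    return updated / norm if norm > 0 else predicted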
Example #2
 def prune(self, belief_node):
     """
     Prune the siblings of the chosen belief node and
     set that node as the new "root"
     :return:
     """
     start_time = time.time()
     self.belief_tree.prune_siblings(belief_node)
     elapsed = time.time() - start_time
     console(2, module, "Time spent pruning = " + str(elapsed) + " seconds")
Example #3
 def show(self, epoch):
     print_divider('large')
     print('\tEpoch #' + str(epoch) + ' RESULTS')
     print_divider('large')
     console(2, module, 'discounted return statistics')
     print_divider('medium')
     self.discounted_return.show()
     print_divider('medium')
     console(2, module, 'undiscounted return statistics')
     print_divider('medium')
     self.undiscounted_return.show()
Example #4
    def display_step_result(step_num, step_result):
        """
        Pretty prints step result information
        :param step_num:
        :param step_result:
        :return:
        """

        console(3, module, 'Step Number = ' + str(step_num))
        console(3, module, 'Step Result.Action = ' + step_result.action.to_string())
        console(3, module, 'Step Result.Observation = ' + step_result.observation.to_string())
        console(3, module, 'Step Result.Next_State = ' + step_result.next_state.to_string())
        console(3, module, 'Step Result.Reward = ' + str(step_result.reward))
Example #5
    def discounted_return(self):

        if self.model.solver == 'ValueIteration':
            solver = self.solver_factory(self)

            self.run_value_iteration(solver, 1)

            if self.model.save:
                save_pkl(solver.gamma,
                         os.path.join(self.model.weight_dir,
                                      'VI_planning_horizon_{}.pkl'.format(self.model.planning_horizon)))

        elif not self.model.use_tf:
            self.multi_epoch()
        else:
            self.multi_epoch_tf()

        print('\n')
        console(2, module, 'epochs: ' + str(self.model.n_epochs))
        console(2, module, 'ave undiscounted return/step: ' + str(self.experiment_results.undiscounted_return.mean) +
                ' +- ' + str(self.experiment_results.undiscounted_return.std_err()))
        console(2, module, 'ave discounted return/step: ' + str(self.experiment_results.discounted_return.mean) +
                ' +- ' + str(self.experiment_results.discounted_return.std_err()))
        console(2, module, 'ave time/epoch: ' + str(self.experiment_results.time.mean))

        self.logger.info('env: ' + self.model.env + '\t' +
                         'epochs: ' + str(self.model.n_epochs) + '\t' +
                         'ave undiscounted return: ' + str(self.experiment_results.undiscounted_return.mean) + ' +- ' +
                         str(self.experiment_results.undiscounted_return.std_err()) + '\t' +
                         'ave discounted return: ' + str(self.experiment_results.discounted_return.mean) +
                         ' +- ' + str(self.experiment_results.discounted_return.std_err()) +
                         '\t' + 'ave time/epoch: ' + str(self.experiment_results.time.mean))
Example #6
def display_step_result(step_num, step_result):
    """
    Pretty prints step result information
    :param step_num:
    :param step_result:
    :return:
    """
    print_divider("large")
    console(2, module, "Step Number = " + str(step_num))
    console(2, module,
            "Step Result.Action = " + step_result.action.to_string())
    console(2, module,
            "Step Result.Observation = " + step_result.observation.to_string())
    console(2, module,
            "Step Result.Next_State = " + step_result.next_state.to_string())
    console(2, module, "Step Result.Reward = " + str(step_result.reward))
Example #7
    def multi_run(self):
        num_runs = self.model.sys_cfg["num_runs"]

        for i in range(num_runs):

            console(
                2, module, "Starting run " + str(i + 1) + " with " +
                str(self.model.sys_cfg["num_sims"]) + " simulations")

            self.run()
            total_time = self.results.time.mean * self.results.time.count

            if total_time > self.model.sys_cfg["max_time_out"]:
                console(
                    2, module, "Timed out after " + str(i) + " runs in " +
                    str(total_time) + " seconds")
                break
Example #8
 def display_step_result(step_num, step_result):
     """
     Pretty prints step result information
     :param step_num:
     :param step_result:
     :return:
     """
     console(3, module, 'Step Number = ' + str(step_num))
     if step_result.action.to_string() == 'CHECK':
         string = step_result.action.to_string() + ' rock ' + str(step_result.action.rock_no)
     else:
         string = step_result.action.to_string()
     console(3, module, 'Step Result.Action = ' + string)
     console(3, module, 'Step Result.Observation = ' + step_result.observation.to_string())
     console(3, module, 'Step Result.Next_State = ' + step_result.next_state.to_string())
     console(3, module, 'Step Result.Reward = ' + str(step_result.reward))
Example #9
    def multi_epoch(self):
        eps = self.model.epsilon_start

        self.model.reset_for_epoch()

        for i in range(self.model.n_epochs):
            # Reset the epoch stats
            self.results = Results()

            if self.model.solver == 'POMCP':
                eps = self.run_pomcp(i + 1, eps)
                self.model.reset_for_epoch()

            if self.experiment_results.time.running_total > self.model.timeout:
                console(2, module, 'Timed out after ' + str(i) + ' epochs in ' +
                        str(self.experiment_results.time.running_total) + ' seconds')
                break
Example #10
    def policy_iteration(self):
        """
        Template-method pattern

        For on-policy learning algorithms such as SARSA, this method will carry out the
        policy iteration. Afterwards, the learned policy can be evaluated by consecutive calls to
        select_action(), which specifies the action selection rule

        For off-policy learning algorithms such as Q-learning, this method will repeatedly be called
        at each step of the policy traversal

        The policy iterator itself does not advance; the saved belief is restored after each simulation

        :return:
        """
        start_time = time.time()

        self.total_reward_stats.clear()

        # save the state of the current belief
        # only passing a reference to the action map
        current_belief = self.policy_iterator.copy()

        for i in range(self.model.sys_cfg["num_sims"]):
            # Reset the Simulator
            self.model.reset_for_simulation()

            state = self.policy_iterator.sample_particle()

            console(
                3, module,
                "Starting simulation at random state = " + state.to_string())

            approx_value = self.simulate(state, start_time, i)

            self.total_reward_stats.add(approx_value)

            console(
                3, module,
                "Approximation of the value function = " + str(approx_value))

            # reset the policy iterator
            self.policy_iterator = current_belief
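
The docstring's template-method idea can be sketched generically as follows; the class and method names here are illustrative only, not the library's actual hierarchy:

from abc import ABC, abstractmethod

class PolicyIteratorSketch(ABC):
    """Illustrative skeleton: the base class fixes the simulation loop (the
    'template'), while subclasses supply the action-selection rule."""

    def __init__(self, num_sims):
        self.num_sims = num_sims

    def policy_iteration(self, initial_belief):
        for _ in range(self.num_sims):
            self.simulate(initial_belief.copy())   # invariant part of the algorithm

    def simulate(self, belief):
        action = self.select_action(belief)        # varying part, supplied by subclasses
        # ... generate a step, back up values, recurse ...

    @abstractmethod
    def select_action(self, belief):
        """On-policy solvers (SARSA) and off-policy solvers (Q-learning) differ here."""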
Example #11
    def multi_epoch(self):
        eps = self.model.epsilon_start

        self.model.reset_for_epoch()

        for i in range(self.model.n_epochs):  # number of epochs to run, default 100
            # Reset the epoch stats
            self.results = Results()

            # in each POMCP epoch, states s ~ B(h) are sampled from the belief
            if self.model.solver == 'POMCP':
                eps = self.run_pomcp(i + 1, eps)
                self.model.reset_for_epoch()
                print("##########################")

            if self.experiment_results.time.running_total > self.model.timeout:
                console(
                    2, module, 'Timed out after ' + str(i) + ' epochs in ' +
                    str(self.experiment_results.time.running_total) + ' seconds')
                break
Example #12
    def traverse(self, belief_node, tree_depth, start_time):
        delayed_reward = 0

        state = belief_node.sample_particle()

        # Time expired
        if time.time() - start_time > self.model.action_selection_timeout:
            console(4, module, "action selection timeout")
            return 0

        action = ucb_action(self, belief_node, False)

        # Search horizon reached
        if tree_depth >= self.model.max_depth:
            console(4, module, "Search horizon reached")
            return 0

        step_result, is_legal = self.model.generate_step(state, action)

        child_belief_node = belief_node.child(action, step_result.observation)
        if child_belief_node is None and not step_result.is_terminal and belief_node.action_map.total_visit_count > 0:
            child_belief_node, added = belief_node.create_or_get_child(
                action, step_result.observation)

        if not step_result.is_terminal or not is_legal:
            tree_depth += 1
            if child_belief_node is not None:
                # Add S' to the new belief node
                # Add a state particle with the new state
                if len(child_belief_node.state_particles) < self.model.max_particle_count:
                    child_belief_node.state_particles.append(step_result.next_state)
                delayed_reward = self.traverse(child_belief_node, tree_depth,
                                               start_time)
            else:
                delayed_reward = self.rollout(belief_node)
            tree_depth -= 1
        else:
            console(4, module, "Reached terminal state.")

        # delayed_reward is "Q maximal"
        # current_q_value is the Q value of the current belief-action pair
        action_mapping_entry = belief_node.action_map.get_entry(
            action.bin_number)

        q_value = action_mapping_entry.mean_q_value
        # also update the p-value, which counts obstacle observations and illegal moves
        p_value = action_mapping_entry.mean_p_value
        # off-policy Q learning update rule
        q_value += (step_result.reward +
                    (self.model.discount * delayed_reward) - q_value)
        p_value += 1 if step_result.observation.is_obstacle or not is_legal else 0
        action_mapping_entry.update_visit_count(1)
        action_mapping_entry.update_q_value(q_value)
        action_mapping_entry.update_p_value(p_value)

        # Add RAVE ?
        return q_value
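
The backup above is the usual incremental Q-learning form Q(b,a) <- Q(b,a) + alpha * [r + gamma * delayed_reward - Q(b,a)]; the exact averaging is delegated to the action-mapping entry. A standalone running-mean sketch (not the library's ActionMappingEntry API) would be:

class QStatSketch(object):
    """Illustrative running-mean Q estimate for one (belief, action) pair."""

    def __init__(self):
        self.visit_count = 0
        self.mean_q_value = 0.0

    def backup(self, reward, delayed_reward, discount):
        target = reward + discount * delayed_reward
        self.visit_count += 1
        # incremental mean: Q <- Q + (target - Q) / N
        self.mean_q_value += (target - self.mean_q_value) / self.visit_count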
Example #13
 def show():
     print_divider("large")
     print("\tRUN RESULTS")
     print_divider("large")
     console(2, module, "Discounted Return statistics")
     print_divider("medium")
     Results.discounted_return.show()
     print_divider("medium")
     console(2, module, "Un-discounted Return statistics")
     print_divider("medium")
     Results.undiscounted_return.show()
     print_divider("medium")
     console(2, module, "Time")
     print_divider("medium")
     Results.time.show()
     print_divider("medium")
Example #14
    def discounted_return(self):

        self.multi_epoch()

        print('\n')
        console(2, module, 'epochs: ' + str(self.model.n_epochs))
        console(2, module, 'ave undiscounted return/step: ' + str(self.experiment_results.undiscounted_return.mean) +
                ' +- ' + str(self.experiment_results.undiscounted_return.std_err()))
        console(2, module, 'ave discounted return/step: ' + str(self.experiment_results.discounted_return.mean) +
                ' +- ' + str(self.experiment_results.discounted_return.std_err()))
        # console(2, module, 'ave time/epoch: ' + str(self.experiment_results.time.mean))

        self.logger.info('env: ' + self.model.env + '\t' +
                         'epochs: ' + str(self.model.n_epochs) + '\t' +
                         'ave undiscounted return: ' + str(self.experiment_results.undiscounted_return.mean) + ' +- ' +
                         str(self.experiment_results.undiscounted_return.std_err()) + '\t' +
                         'ave discounted return: ' + str(self.experiment_results.discounted_return.mean) +
                         ' +- ' + str(self.experiment_results.discounted_return.std_err()) +
                         '\t' + 'ave time/epoch: ' + str(self.experiment_results.time.mean))

        return self.policy['optimal_traj']
Example #15
 def reset_for_epoch(self):
     self.actual_rock_states = self.sample_rocks()
     console(2, module,
             "Actual rock states = " + str(self.actual_rock_states))
Example #16
    def run(self, num_steps=None):
        run_start_time = time.time()
        discount = 1.0

        if num_steps is None:
            num_steps = self.model.sys_cfg["num_steps"]

        # Reset the running total for each statistic for this run
        self.results.reset_running_totals()

        # Create a new solver
        solver = self.solver_factory(self, self.model)

        # Perform sim behaviors that must be done for each run
        self.model.reset_for_run()

        console(
            2, module, "num of particles generated = " +
            str(len(solver.belief_tree.root.state_particles)))

        if solver.on_policy:
            solver.policy_iteration()

        # Monte-Carlo start state
        state = self.model.sample_an_init_state()
        console(2, module, "Initial search state: " + state.to_string())

        for i in range(num_steps):
            start_time = time.time()

            # action will be of type Discrete Action
            action = solver.select_action()

            step_result, is_legal = self.model.generate_step(state, action)

            self.results.reward.add(step_result.reward)
            self.results.undiscounted_return.running_total += step_result.reward
            self.results.discounted_return.running_total += (
                step_result.reward * discount)

            discount *= self.model.sys_cfg["discount"]
            state = step_result.next_state

            # show the step result
            display_step_result(i, step_result)

            if not step_result.is_terminal:
                solver.update(step_result)

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            new_hist_entry.reward = step_result.reward
            new_hist_entry.action = step_result.action
            new_hist_entry.observation = step_result.observation
            new_hist_entry.register_entry(new_hist_entry, None,
                                          step_result.next_state)

            if step_result.is_terminal:
                console(2, module, "Terminated after episode step " + str(i))
                break

            console(
                2, module, "MCTS step forward took " +
                str(time.time() - start_time) + " seconds")

        self.results.time.add(time.time() - run_start_time)
        self.results.discounted_return.add(
            self.results.discounted_return.running_total)
        self.results.undiscounted_return.add(
            self.results.undiscounted_return.running_total)

        # Pretty Print results
        print_divider("large")
        solver.history.show()
        self.results.show()
        console(
            2, module, "Max possible total Un-discounted Return: " +
            str(self.model.get_max_undiscounted_return()))
        print_divider("medium")
Example #17
    def traverse(self, belief_node, tree_depth, start_time):
        delayed_reward = 0

        # s ~ B(h); the belief tree does not change, only the sampled state does
        state = belief_node.sample_particle()

        # sample the argmax UCB action; inside the simulation, Q-values are
        # computed to expand the tree
        action = ucb_action(self, belief_node, False)

        # Search horizon reached
        if tree_depth >= self.model.max_depth:
            console(4, module, "Search horizon reached")
            return 0

        # black-box simulator step
        step_result, is_legal = self.model.generate_step(state, action)

        # h' <- (h, a, o): look up the child belief node, which may be None
        child_belief_node = belief_node.child(action, step_result.observation)

        if child_belief_node is None and not step_result.is_terminal and belief_node.action_map.total_visit_count > 0:
            child_belief_node, added = belief_node.create_or_get_child(
                action, step_result.observation)

        if not step_result.is_terminal or not is_legal:
            tree_depth += 1
            if child_belief_node is not None:
                # Add S' to the new belief node
                # Add a state particle with the new state
                if len(child_belief_node.state_particles) < self.model.max_particle_count:
                    child_belief_node.state_particles.append(step_result.next_state)
                # recurse down the tree
                delayed_reward = self.traverse(child_belief_node, tree_depth,
                                               start_time)
            else:
                # child_belief_node is None: fall back to a rollout
                delayed_reward = self.rollout(belief_node)
            tree_depth -= 1
        else:
            console(4, module, "Reached terminal state.")

        # delayed_reward is "Q maximal"
        # current_q_value is the Q value of the current belief-action pair
        action_mapping_entry = belief_node.action_map.get_entry(
            action.bin_number)

        q_value = action_mapping_entry.mean_q_value

        # off-policy Q learning update rule
        q_value += (step_result.reward +
                    (self.model.discount * delayed_reward) - q_value)

        action_mapping_entry.update_visit_count(1)
        action_mapping_entry.update_q_value(q_value)

        # Add RAVE ?
        return q_value
Example #18
    def run_pomcp(self, epoch, eps):
        epoch_start = time.time()

        # Create a new solver
        solver = self.solver_factory(self)

        # Monte-Carlo start state
        state = solver.belief_tree_index.sample_particle()

        reward = 0
        discounted_reward = 0
        discount = 1.0

        for i in range(self.model.max_steps):

            start_time = time.time()

            # action will be of type Discrete Action
            action = solver.select_eps_greedy_action(eps, start_time)

            # update epsilon
            if eps > self.model.epsilon_minimum:
                eps *= self.model.epsilon_decay

            step_result, is_legal = self.model.generate_step(state, action)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward

            discount *= self.model.discount
            state = step_result.next_state

            # show the step result
            self.display_step_result(i, step_result)

            if not step_result.is_terminal or not is_legal:
                solver.update(step_result)

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry,
                                              step_result.reward,
                                              step_result.action,
                                              step_result.observation,
                                              step_result.next_state)

            if step_result.is_terminal or not is_legal:
                console(3, module,
                        'Terminated after episode step ' + str(i + 1))
                break

        self.results.time.add(time.time() - epoch_start)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        # print_divider('large')
        solver.history.show()
        self.results.show(epoch)
        console(
            3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (
            self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(
            self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (
            self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(
            self.results.discounted_return.running_total)

        return eps
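
For reference, the epsilon-greedy selection and geometric decay used in this loop can be sketched as below; the `q_values` mapping and `legal_actions` list are hypothetical stand-ins for the solver's internal action statistics:

import random

def select_eps_greedy(q_values, legal_actions, eps):
    """With probability eps explore uniformly; otherwise exploit the best Q."""
    if random.random() < eps:
        return random.choice(legal_actions)
    return max(legal_actions, key=lambda a: q_values.get(a, 0.0))

def decay_eps(eps, eps_minimum, eps_decay):
    """Geometric decay, clipped at the configured minimum."""
    return max(eps_minimum, eps * eps_decay)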
Example #19
 def reset_for_epoch(self):
     self.actual_cell_states = self.sample_cells()
     console(2, module,
             "Actual cell states = " + str(self.actual_cell_states))
Example #20
    def update(self, step_result, prune=True):
        """
        Feed back the step result, updating the belief_tree,
        extending the history, updating particle sets, etc

        Advance the policy index to point to the next belief node in the episode

        :return:
        """
        # Update the Simulator with the Step Result
        # This is important in case there are certain actions that change the state of the simulator
        self.model.update(step_result)

        child_belief_node = self.belief_tree_index.get_child(step_result.action, step_result.observation)

        # If the child_belief_node is None because the step result randomly produced a different observation,
        # grab any of the beliefs extending from the belief node's action node
        if child_belief_node is None:
            action_node = self.belief_tree_index.action_map.get_action_node(step_result.action)
            if action_node is None:
                # I grabbed a child belief node that doesn't have an action node. Use rollout from here on out.
                console(2, module, "Reached branch with no leaf nodes, using random rollout to finish the episode")
                print("Should not get here!")
                exit()
                self.disable_tree = True
                return

            obs_mapping_entries = list(action_node.observation_map.child_map.values())

            for entry in obs_mapping_entries:
                if entry.child_node is not None:
                    child_belief_node = entry.child_node
                    console(2, module, "Had to grab nearest belief node...variance added")
                    print("if get here, we need to think about this case!")
                    exit()
                    break

        # If the new root does not yet have the max possible number of particles add some more
        if len(child_belief_node.state_particles) < self.model.max_particle_count:

            num_to_add = self.model.max_particle_count - len(child_belief_node.state_particles)

            # Generate particles for the new root node
            child_belief_node.state_particles += self.model.generate_particles(self.belief_tree_index, step_result.action,
                                                                               step_result.observation, num_to_add,
                                                                               self.belief_tree_index.state_particles)

            # If that failed, attempt to create a new state particle set
            if len(child_belief_node.state_particles) == 0:
                print("you will not believe this ever becoming zero!")
                exit()
                child_belief_node.state_particles += self.model.generate_particles_uninformed(self.belief_tree_index,
                                                                                              step_result.action,
                                                                                              step_result.observation,
                                                                                        self.model.min_particle_count)

        # Failed to continue search- ran out of particles
        if child_belief_node is None or len(child_belief_node.state_particles) == 0:
            console(1, module, "Couldn't refill particles, must use random rollout to finish episode")
            exit()
            self.disable_tree = True
            return

        self.belief_tree_index = child_belief_node
        if prune:
            self.prune(self.belief_tree_index)
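
The refill step above delegates to the model's particle generators. A common way to implement such a generator is rejection sampling from the parent belief; the sketch below assumes the `generate_step(state, action)` interface used elsewhere in these examples and an equality check on observations, both of which are assumptions about the real model:

import random

def generate_particles_by_rejection(parent_particles, action, observation, model,
                                    num_to_add, max_attempts=10000):
    """Sample states from the parent belief, simulate one step, and keep the
    next states whose simulated observation matches the real one."""
    particles = []
    attempts = 0
    while len(particles) < num_to_add and attempts < max_attempts:
        attempts += 1
        state = random.choice(parent_particles)
        step_result, _ = model.generate_step(state, action)
        if step_result.observation == observation:
            particles.append(step_result.next_state)
    return particles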
Example #21
    def run_pomcp(self, epoch, eps):
        epoch_start = time.time()

        # Create a new solver (UCB action selection, belief tree, etc.);
        # building it populates the belief tree with state particles
        solver = self.solver_factory(self)

        # Monte-Carlo start state: randomly sample a start state from the root belief
        state = solver.belief_tree_index.sample_particle()

        reward = 0
        discounted_reward = 0
        discount = 1.0

        # Each step advances the root of the belief tree; this can be seen as
        # building the history (growing the tree by choosing actions).
        # max_steps is the max number of steps per trial/episode/trajectory/epoch (200 here)
        for i in range(self.model.max_steps):

            start_time = time.time()

            # action will be of type Discrete Action, chosen after planning;
            # this is the POMCP planning step: for one belief state, run many
            # simulations (e.g. 500) before acting
            action = solver.select_eps_greedy_action(eps, start_time)  # eps is the exploration rate

            ########################################################################
            # update epsilon
            if eps > self.model.epsilon_minimum:
                eps *= self.model.epsilon_decay

            # this is the execution stage (act and sense):
            # execute the chosen action and obtain the next state and observation
            step_result, is_legal = self.model.generate_step(state, action)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward

            discount *= self.model.discount
            state = step_result.next_state

            print("inside step loop: {}".format(i))
            # show the step result
            self.display_step_result(i, step_result)

            if not step_result.is_terminal or not is_legal:
                # update the belief state and prune; the belief tree gets changed
                solver.update(step_result)

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry,
                                              step_result.reward,
                                              step_result.action,
                                              step_result.observation,
                                              step_result.next_state)

            if step_result.is_terminal or not is_legal:
                console(3, module,
                        'Terminated after episode step ' + str(i + 1))
                break

        self.results.time.add(time.time() - epoch_start)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        # print_divider('large')
        solver.history.show()
        self.results.show(epoch)
        console(
            3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (
            self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(
            self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (
            self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(
            self.results.discounted_return.running_total)

        return eps
Example #22
    def traverse(self, belief_node, tree_depth, start_time):
        delayed_reward = 0

        state = belief_node.sample_particle()

        # Time expired
        if time.time() - start_time > self.model.action_selection_timeout:
            console(4, module, "action selection timeout")
            return 0

        action = ucb_action(self, belief_node, False)

        # Search horizon reached
        if tree_depth >= self.model.max_depth:
            console(4, module, "Search horizon reached")
            return 0

        step_result, is_legal = self.model.generate_step(state, action)

        # if belief_node->action_node child->belief_node child exists
        # copy all the data from belief_node to the (a,o) child belief node
        # print "simulate: action=", action.bin_number, " obs=", step_result.observation.is_good, "total visit=", belief_node.action_map.total_visit_count, "depth=", belief_node.depth
        child_belief_node = belief_node.child(action, step_result.observation)

        # grow the belief tree by constructing the new child_belief_node
        if child_belief_node is None and not step_result.is_terminal and belief_node.visited:
            child_belief_node, added = belief_node.create_or_get_child(
                action, step_result.observation)

        if not step_result.is_terminal or not is_legal:
            if child_belief_node is not None:
                tree_depth += 1
                # Add S' to the new belief node
                # Add a state particle with the new state
                if len(child_belief_node.state_particles) < self.model.max_particle_count:
                    child_belief_node.state_particles.append(step_result.next_state)
                delayed_reward = self.traverse(child_belief_node, tree_depth,
                                               start_time)
            else:
                delayed_reward = self.rollout_from_state(state)
                belief_node.visited = True
                # total_reward = step_result.reward + (self.model.discount * delayed_reward)
                # return total_reward
        else:
            console(4, module, "Reached terminal state.")

        # delayed_reward is "Q maximal"
        # current_q_value is the Q value of the current belief-action pair
        action_mapping_entry = belief_node.action_map.get_entry(
            action.bin_number)

        q_value = action_mapping_entry.mean_q_value

        # off-policy Q learning update rule
        q_value += (step_result.reward +
                    (self.model.discount * delayed_reward) - q_value)

        action_mapping_entry.update_visit_count(1)
        action_mapping_entry.update_q_value(q_value)

        # off-policy Q-learning: back up the maximum Q-value over this belief node's actions
        max_q_value = -np.inf
        for action_entry in belief_node.action_map.entries.values():
            if action_entry.mean_q_value > max_q_value:
                max_q_value = action_entry.mean_q_value

        # Add RAVE ?
        return max_q_value
Example #23
    def run_pomcp(self, epoch, eps):
        epoch_start = time.time()

        # Create a new solver
        solver = self.solver_factory(self)

        # Monte-Carlo start state
        state = solver.belief_tree_index.sample_particle()

        # NOTE: rock example specific
        self.model.set_rock_states(state)

        reward = 0
        discounted_reward = 0
        discount = 1.0

        solver.show_current_belief()

        for i in range(self.model.max_steps):

            start_time = time.time()

            # action will be of type Discrete Action
            action = solver.select_eps_greedy_action(eps, start_time)

            # print("selected action : " + str(action.bin_number))
            # raw_input("Press Enter to continue...")

            # update epsilon
            if eps > self.model.epsilon_minimum:
                eps *= self.model.epsilon_decay

            step_result, is_legal = self.model.generate_step(state, action)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward

            discount *= self.model.discount
            state = step_result.next_state

            # show the step result
            self.display_step_result(i, step_result)

            if not step_result.is_terminal or not is_legal:
                # prune the tree and augment the child belief node to proceed with enough particles that match the current (a,o)
                solver.update(step_result)
            
            solver.show_current_belief()

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry, step_result.reward,
                                              step_result.action, step_result.observation, step_result.next_state)

            if step_result.is_terminal or not is_legal:
                console(3, module, 'Terminated after episode step ' + str(i + 1))
                break

        self.results.time.add(time.time() - epoch_start)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        # print_divider('large')
        solver.history.show()
        self.results.show(epoch)
        console(3, module, 'Total possible undiscounted return: ' + str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(self.results.discounted_return.running_total)

        return eps
Example #24
    def discounted_return(self):
        """
        Encapsulates logging and begins the runs
        :return:
        """
        console(2, module, "Main runs")

        self.logger.info(
            "Simulations\tRuns\tUndiscounted Return\tUndiscounted Error\t" +
            "\tDiscounted Return\tDiscounted Error\tTime")

        self.multi_run()

        console(2, module,
                "Simulations = " + str(self.model.sys_cfg["num_sims"]))
        console(2, module, "Runs = " + str(self.results.time.count))
        console(
            2, module, "Undiscounted Return = " +
            str(self.results.undiscounted_return.mean) + " +- " +
            str(self.results.undiscounted_return.std_err()))
        console(
            2, module,
            "Discounted Return = " + str(self.results.discounted_return.mean) +
            " +- " + str(self.results.discounted_return.std_err()))
        console(2, module, "Time = " + str(self.results.time.mean))

        self.logger.info(
            str(self.model.sys_cfg["num_sims"]) + '\t' +
            str(self.results.time.count) + '\t' + '\t' +
            str(self.results.undiscounted_return.mean) + '\t' +
            str(self.results.undiscounted_return.std_err()) + '\t' + '\t' +
            str(self.results.discounted_return.mean) + '\t' +
            str(self.results.discounted_return.std_err()) + '\t' + '\t' +
            str(self.results.time.mean))
Example #25
 def set_rock_states(self, state):
     self.actual_rock_states = state.rock_states
     console(2, module,
             "Actual rock states = " + str(self.actual_rock_states))