Example #1
    def run_value_iteration(self, solver, epoch):
        run_start_time = time.time()

        reward = 0
        discounted_reward = 0
        discount = 1.0

        solver.value_iteration(self.model.get_transition_matrix(),
                               self.model.get_observation_matrix(),
                               self.model.get_reward_matrix(),
                               self.model.planning_horizon)

        b = self.model.get_initial_belief_state()

        for i in range(self.model.max_steps):

            # TODO: record average V(b) per epoch
            action, v_b = solver.select_action(b, solver.gamma)

            step_result = self.model.generate_step(action)

            if not step_result.is_terminal:
                b = self.model.belief_update(b, action,
                                             step_result.observation)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward
            discount *= self.model.discount

            # show the step result
            self.display_step_result(i, step_result)

            if step_result.is_terminal:
                console(3, module,
                        'Terminated after episode step ' + str(i + 1))
                break

            # TODO: add belief state History sequence

        self.results.time.add(time.time() - run_start_time)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        self.results.show(epoch)
        console(
            3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (
            self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(
            self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (
            self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(
            self.results.discounted_return.running_total)
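
Example #1's run_value_iteration takes an already-built solver and an epoch index, and folds its per-epoch statistics into experiment_results. A minimal driver sketch is shown below; agent, make_solver and n_epochs are hypothetical names used only for illustration, not the code base's confirmed API.

def run_experiment(agent, make_solver, n_epochs=10):
    # Hypothetical driver: run several epochs of value iteration in a row.
    # experiment_results accumulates across epochs inside run_value_iteration.
    for epoch in range(1, n_epochs + 1):
        solver = make_solver(agent)              # fresh solver per epoch (assumed factory)
        agent.run_value_iteration(solver, epoch)
    return agent.experiment_results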
Example #2
 def show(self):
     print("Displaying history sequence...")
     for entry in self.entry_sequence:
         print_divider("medium")
         print("id: ", entry.id)
         print("action: ", entry.action.to_string())
         print("observation: ", entry.observation.to_string())
         print("next state: ", entry.state.to_string())
         print("reward: ", entry.reward)
     time.sleep(2)  # pause for 2 seconds
Example #3
 def show(self):
     print_divider("medium")
     print("\tDisplaying history sequence")
     for entry in self.entry_sequence:
         print_divider("medium")
         print("id: ", entry.id)
         print("action: ", entry.action.to_string())
         print("observation: ", entry.observation.to_string())
         print("next state: ", entry.state.to_string())
         print("reward: ", entry.reward)
Example #4
 def show(self, epoch):
     print_divider('large')
     print('\tEpoch #' + str(epoch) + ' RESULTS')
     print_divider('large')
     console(2, module, 'discounted return statistics')
     print_divider('medium')
     self.discounted_return.show()
     print_divider('medium')
     console(2, module, 'undiscounted return statistics')
     print_divider('medium')
     self.undiscounted_return.show()
Example #5
def display_step_result(step_num, step_result):
    """
    Pretty prints step result information
    :param step_num:
    :param step_result:
    :return:
    """
    print_divider("large")
    console(2, module, "Step Number = " + str(step_num))
    console(2, module,
            "Step Result.Action = " + step_result.action.to_string())
    console(2, module,
            "Step Result.Observation = " + step_result.observation.to_string())
    console(2, module,
            "Step Result.Next_State = " + step_result.next_state.to_string())
    console(2, module, "Step Result.Reward = " + str(step_result.reward))
Example #6
 def show():
     print_divider("large")
     print "\tRUN RESULTS"
     print_divider("large")
     console(2, module, "Discounted Return statistics")
     print_divider("medium")
     Results.discounted_return.show()
     print_divider("medium")
     console(2, module, "Un-discounted Return statistics")
     print_divider("medium")
     Results.undiscounted_return.show()
     print_divider("medium")
     console(2, module, "Time")
     print_divider("medium")
     Results.time.show()
     print_divider("medium")
Example #7
    def run(self, num_steps=None):
        run_start_time = time.time()
        discount = 1.0

        if num_steps is None:
            num_steps = self.model.sys_cfg["num_steps"]

        # Reset the running total for each statistic for this run
        self.results.reset_running_totals()

        # Create a new solver
        solver = self.solver_factory(self, self.model)

        # Perform simulation behaviors that must be done for each run
        self.model.reset_for_run()

        console(
            2, module, "num of particles generated = " +
            str(len(solver.belief_tree.root.state_particles)))

        if solver.on_policy:
            solver.policy_iteration()

        # Monte-Carlo start state
        state = self.model.sample_an_init_state()
        console(2, module, "Initial search state: " + state.to_string())

        for i in range(num_steps):
            start_time = time.time()

            # action will be of type Discrete Action
            action = solver.select_action()

            step_result, is_legal = self.model.generate_step(state, action)

            self.results.reward.add(step_result.reward)
            self.results.undiscounted_return.running_total += step_result.reward
            self.results.discounted_return.running_total += (
                step_result.reward * discount)

            discount *= self.model.sys_cfg["discount"]
            state = step_result.next_state

            # show the step result
            display_step_result(i, step_result)

            if not step_result.is_terminal:
                solver.update(step_result)

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            new_hist_entry.reward = step_result.reward
            new_hist_entry.action = step_result.action
            new_hist_entry.observation = step_result.observation
            new_hist_entry.register_entry(new_hist_entry, None,
                                          step_result.next_state)

            if step_result.is_terminal:
                console(2, module, "Terminated after episode step " + str(i))
                break

            console(
                2, module, "MCTS step forward took " +
                str(time.time() - start_time) + " seconds")

        self.results.time.add(time.time() - run_start_time)
        self.results.discounted_return.add(
            self.results.discounted_return.running_total)
        self.results.undiscounted_return.add(
            self.results.undiscounted_return.running_total)

        # Pretty Print results
        print_divider("large")
        solver.history.show()
        self.results.show()
        console(
            2, module, "Max possible total Un-discounted Return: " +
            str(self.model.get_max_undiscounted_return()))
        print_divider("medium")
Example #8
    def run_pomcp(self, epoch, eps):
        epoch_start = time.time()

        # Create a new solver
        solver = self.solver_factory(self)

        # Monte-Carlo start state
        state = solver.belief_tree_index.sample_particle()

        reward = 0
        discounted_reward = 0
        discount = 1.0

        for i in range(self.model.max_steps):

            start_time = time.time()

            # action will be of type Discrete Action
            action = solver.select_eps_greedy_action(eps, start_time)

            # update epsilon
            if eps > self.model.epsilon_minimum:
                eps *= self.model.epsilon_decay

            step_result, is_legal = self.model.generate_step(state, action)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward

            discount *= self.model.discount
            state = step_result.next_state

            # show the step result
            self.display_step_result(i, step_result)

            if not step_result.is_terminal or not is_legal:
                solver.update(step_result)

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry,
                                              step_result.reward,
                                              step_result.action,
                                              step_result.observation,
                                              step_result.next_state)

            if step_result.is_terminal or not is_legal:
                console(3, module,
                        'Terminated after episode step ' + str(i + 1))
                break

        self.results.time.add(time.time() - epoch_start)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        # print_divider('large')
        solver.history.show()
        self.results.show(epoch)
        console(
            3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (
            self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(
            self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (
            self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(
            self.results.discounted_return.running_total)

        return eps
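
Because run_pomcp returns the decayed epsilon, a caller can thread it through consecutive epochs so the exploration rate keeps shrinking over the whole experiment rather than resetting each epoch. A minimal sketch, assuming a hypothetical agent object (this is not the library's confirmed driver code):

def run_epochs(agent, n_epochs=20, eps_start=1.0):
    # Each epoch resumes from the epsilon the previous epoch decayed to,
    # until it bottoms out near the model's epsilon_minimum.
    eps = eps_start
    for epoch in range(1, n_epochs + 1):
        eps = agent.run_pomcp(epoch, eps)
    return eps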
Example #9
    def run_pomcp(self, epoch, eps):
        epoch_start = time.time()

        # Create a new solver
        solver = self.solver_factory(self)

        # Monte-Carlo start state
        state = solver.belief_tree_index.sample_particle()

        # NOTE: rock example specific
        self.model.set_rock_states(state)

        reward = 0
        discounted_reward = 0
        discount = 1.0

        solver.show_current_belief()

        for i in range(self.model.max_steps):

            start_time = time.time()

            # action will be of type Discrete Action
            action = solver.select_eps_greedy_action(eps, start_time)

            # print("selected action : " + str(action.bin_number))
            # raw_input("Press Enter to continue...")

            # update epsilon
            if eps > self.model.epsilon_minimum:
                eps *= self.model.epsilon_decay

            step_result, is_legal = self.model.generate_step(state, action)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward

            discount *= self.model.discount
            state = step_result.next_state

            # show the step result
            self.display_step_result(i, step_result)

            if not step_result.is_terminal or not is_legal:
                # prune the tree and augment the child belief node to proceed with enough particles that match the current (a,o)
                solver.update(step_result)
            
            solver.show_current_belief()

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry, step_result.reward,
                                              step_result.action, step_result.observation, step_result.next_state)

            if step_result.is_terminal or not is_legal:
                console(3, module, 'Terminated after episode step ' + str(i + 1))
                break

        self.results.time.add(time.time() - epoch_start)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        # print_divider('large')
        solver.history.show()
        self.results.show(epoch)
        console(3, module, 'Total possible undiscounted return: ' + str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(self.results.discounted_return.running_total)

        return eps
Example #10
    def run_pomcp(self, epoch, eps):
        epoch_start = time.time()

        # Create a new solver (UCB action selection plus the belief tree).
        # Building it first constructs a belief tree whose root holds many
        # state particles.
        solver = self.solver_factory(self)

        # Monte-Carlo start state: randomly sample a start state
        state = solver.belief_tree_index.sample_particle()

        reward = 0
        discounted_reward = 0
        discount = 1.0

        # The process of changing the root of the belief tree; think of it as
        # building the history (growing the tree by choosing actions).
        # max_steps is the maximum number of steps per trial/episode/epoch (200 here).
        for i in range(self.model.max_steps):

            start_time = time.time()

            # action will be of type Discrete Action, chosen after planning.
            # This is the POMCP planning step: for the current belief, run the
            # configured number of simulations (500 here) before committing.
            # eps is the exploration rate for the eps-greedy action selection.
            action = solver.select_eps_greedy_action(eps, start_time)

            ########################################################################
            # update epsilon
            if eps > self.model.epsilon_minimum:
                eps *= self.model.epsilon_decay

            # Execution stage (act and sense): actually execute the chosen
            # action and get the next state and observation.
            step_result, is_legal = self.model.generate_step(state, action)

            reward += step_result.reward
            discounted_reward += discount * step_result.reward

            discount *= self.model.discount
            state = step_result.next_state

            print("inside step loop: {}".format(i))
            # show the step result
            self.display_step_result(i, step_result)

            if not step_result.is_terminal or not is_legal:
                # Update the belief state and prune: the belief tree is
                # re-rooted at the child node matching the executed
                # (action, observation) pair.
                solver.update(step_result)

            # Extend the history sequence
            new_hist_entry = solver.history.add_entry()
            HistoryEntry.update_history_entry(new_hist_entry,
                                              step_result.reward,
                                              step_result.action,
                                              step_result.observation,
                                              step_result.next_state)

            if step_result.is_terminal or not is_legal:
                console(3, module,
                        'Terminated after episode step ' + str(i + 1))
                break

        self.results.time.add(time.time() - epoch_start)
        self.results.update_reward_results(reward, discounted_reward)

        # Pretty Print results
        # print_divider('large')
        solver.history.show()
        self.results.show(epoch)
        console(
            3, module, 'Total possible undiscounted return: ' +
            str(self.model.get_max_undiscounted_return()))
        print_divider('medium')

        self.experiment_results.time.add(self.results.time.running_total)
        self.experiment_results.undiscounted_return.count += (
            self.results.undiscounted_return.count - 1)
        self.experiment_results.undiscounted_return.add(
            self.results.undiscounted_return.running_total)
        self.experiment_results.discounted_return.count += (
            self.results.discounted_return.count - 1)
        self.experiment_results.discounted_return.add(
            self.results.discounted_return.running_total)

        return eps
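
The comments in Example #10 describe the three phases of each step: plan by simulating from the current belief, act and sense in the real model, then update (re-root and prune) the belief tree. The toy sketch below restates that plan / act / update control flow in a self-contained form; ToyWorld and ToyPlanner are assumptions made purely for illustration, not classes from the original code base.

import random

class ToyWorld:
    """One hidden coin; 'look' gives a noisy observation, guessing ends the episode."""
    def __init__(self):
        self.hidden = random.choice([0, 1])

    def step(self, action):
        if action == 'look':
            obs = self.hidden if random.random() < 0.85 else 1 - self.hidden
            return obs, -1.0, False              # observation, reward, terminal
        return None, (10.0 if action == self.hidden else -10.0), True

class ToyPlanner:
    """Keeps a belief over the hidden coin and acts greedily on it."""
    def __init__(self):
        self.p_one = 0.5                         # belief that hidden == 1

    def plan(self):
        if 0.2 < self.p_one < 0.8:
            return 'look'                        # uncertain: gather information
        return 1 if self.p_one >= 0.8 else 0     # confident: commit to a guess

    def update(self, action, observation):
        if action != 'look' or observation is None:
            return
        like1 = 0.85 if observation == 1 else 0.15
        like0 = 0.15 if observation == 1 else 0.85
        z = like1 * self.p_one + like0 * (1 - self.p_one)
        self.p_one = like1 * self.p_one / z      # Bayes update of the belief

world, planner, total = ToyWorld(), ToyPlanner(), 0.0
for step in range(50):
    action = planner.plan()                      # plan
    obs, reward, done = world.step(action)       # act and sense
    total += reward
    if done:
        break
    planner.update(action, obs)                  # update the belief
print('return:', total)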